All downloads are free. Search and download functionality uses the official Maven repository.

org.openrdf.rio.turtle.TurtleParser Maven / Gradle / Ivy

/* 
 * Licensed to Aduna under one or more contributor license agreements.  
 * See the NOTICE.txt file distributed with this work for additional 
 * information regarding copyright ownership. 
 *
 * Aduna licenses this file to you under the terms of the Aduna BSD 
 * License (the "License"); you may not use this file except in compliance 
 * with the License. See the LICENSE.txt file distributed with this work 
 * for the full License.
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 * implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */
package org.openrdf.rio.turtle;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.input.BOMInputStream;

import info.aduna.text.ASCIIUtil;

import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.XMLSchema;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RioSetting;
import org.openrdf.rio.helpers.BasicParserSettings;
import org.openrdf.rio.helpers.RDFParserBase;
import org.openrdf.rio.helpers.TurtleParserSettings;

/**
 * RDF parser for Turtle
 * files. This parser is not thread-safe, therefore its public methods are
 * synchronized.
 * 

* This implementation is based on the 2006/01/02 version of the Turtle * specification, with slight deviations: *

    *
  • Normalization of integer, floating point and boolean values is dependent * on the specified datatype handling. According to the specification, integers * and booleans should be normalized, but floats don't.
  • *
  • Comments can be used anywhere in the document, and extend to the end of * the line. The Turtle grammar doesn't allow comments to be used inside triple * constructs that extend over multiple lines, but the author's own parser * deviates from this too.
  • *
  • The localname part of a prefixed named is allowed to start with a number * (cf. the W3C Turtle Working * Draft).
  • *
* * @author Arjohn Kampman */ public class TurtleParser extends RDFParserBase { /*-----------* * Variables * *-----------*/ private PushbackReader reader; protected Resource subject; protected URI predicate; protected Value object; private int lineNumber = 1; /*--------------* * Constructors * *--------------*/ /** * Creates a new TurtleParser that will use a {@link ValueFactoryImpl} to * create RDF model objects. */ public TurtleParser() { super(); } /** * Creates a new TurtleParser that will use the supplied ValueFactory to * create RDF model objects. * * @param valueFactory * A ValueFactory. */ public TurtleParser(ValueFactory valueFactory) { super(valueFactory); } /*---------* * Methods * *---------*/ public RDFFormat getRDFFormat() { return RDFFormat.TURTLE; } @Override public Collection> getSupportedSettings() { Set> result = new HashSet>(super.getSupportedSettings()); result.add(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES); return result; } /** * Implementation of the parse(InputStream, String) method defined * in the RDFParser interface. * * @param in * The InputStream from which to read the data, must not be * null. The InputStream is supposed to contain UTF-8 encoded * Unicode characters, as per the Turtle specification. * @param baseURI * The URI associated with the data in the InputStream, must not be * null. * @throws IOException * If an I/O error occurred while data was read from the InputStream. * @throws RDFParseException * If the parser has found an unrecoverable parse error. * @throws RDFHandlerException * If the configured statement handler encountered an unrecoverable * error. * @throws IllegalArgumentException * If the supplied input stream or base URI is null. 
*/ public synchronized void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException { if (in == null) { throw new IllegalArgumentException("Input stream must not be 'null'"); } // Note: baseURI will be checked in parse(Reader, String) try { parse(new InputStreamReader(new BOMInputStream(in, false), "UTF-8"), baseURI); } catch (UnsupportedEncodingException e) { // Every platform should support the UTF-8 encoding... throw new RuntimeException(e); } } /** * Implementation of the parse(Reader, String) method defined in the * RDFParser interface. * * @param reader * The Reader from which to read the data, must not be null. * @param baseURI * The URI associated with the data in the Reader, must not be * null. * @throws IOException * If an I/O error occurred while data was read from the InputStream. * @throws RDFParseException * If the parser has found an unrecoverable parse error. * @throws RDFHandlerException * If the configured statement handler encountered an unrecoverable * error. * @throws IllegalArgumentException * If the supplied reader or base URI is null. 
*/ public synchronized void parse(Reader reader, String baseURI) throws IOException, RDFParseException, RDFHandlerException { if (reader == null) { throw new IllegalArgumentException("Reader must not be 'null'"); } if (baseURI == null) { throw new IllegalArgumentException("base URI must not be 'null'"); } if (rdfHandler != null) { rdfHandler.startRDF(); } // Start counting lines at 1: lineNumber = 1; // Allow at most 8 characters to be pushed back: this.reader = new PushbackReader(reader, 8); // Store normalized base URI setBaseURI(baseURI); reportLocation(); try { int c = skipWSC(); while (c != -1) { parseStatement(); c = skipWSC(); } } finally { clear(); } if (rdfHandler != null) { rdfHandler.endRDF(); } } protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException { StringBuilder sb = new StringBuilder(8); int codePoint; // longest valid directive @prefix do { codePoint = readCodePoint(); if (codePoint == -1 || TurtleUtil.isWhitespace(codePoint)) { unread(codePoint); break; } sb.append(Character.toChars(codePoint)); } while (sb.length() < 8); String directive = sb.toString(); if (directive.startsWith("@") || directive.equalsIgnoreCase("prefix") || directive.equalsIgnoreCase("base")) { parseDirective(directive); skipWSC(); // SPARQL BASE and PREFIX lines do not end in . 
if (directive.startsWith("@")) { verifyCharacterOrFail(readCodePoint(), "."); } } else { unread(directive); parseTriples(); skipWSC(); verifyCharacterOrFail(readCodePoint(), "."); } } protected void parseDirective(String directive) throws IOException, RDFParseException, RDFHandlerException { if (directive.length() >= 7 && directive.substring(0, 7).equals("@prefix")) { if (directive.length() > 7) { unread(directive.substring(7)); } parsePrefixID(); } else if (directive.length() >= 5 && directive.substring(0, 5).equals("@base")) { if (directive.length() > 5) { unread(directive.substring(5)); } parseBase(); } else if (directive.length() >= 6 && directive.substring(0, 6).equalsIgnoreCase("prefix")) { // SPARQL doesn't require whitespace after directive, so must unread if // we found part of the prefixID if (directive.length() > 6) { unread(directive.substring(6)); } parsePrefixID(); } else if ((directive.length() >= 4 && directive.substring(0, 4).equalsIgnoreCase("base"))) { if (directive.length() > 4) { unread(directive.substring(4)); } parseBase(); } else if (directive.length() >= 7 && directive.substring(0, 7).equalsIgnoreCase("@prefix")) { if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) { reportFatalError("Cannot strictly support case-insensitive @prefix directive in compliance mode."); } if (directive.length() > 7) { unread(directive.substring(7)); } parsePrefixID(); } else if (directive.length() >= 5 && directive.substring(0, 5).equalsIgnoreCase("@base")) { if (!this.getParserConfig().get(TurtleParserSettings.CASE_INSENSITIVE_DIRECTIVES)) { reportFatalError("Cannot strictly support case-insensitive @base directive in compliance mode."); } if (directive.length() > 5) { unread(directive.substring(5)); } parseBase(); } else if (directive.length() == 0) { reportFatalError("Directive name is missing, expected @prefix or @base"); } else { reportFatalError("Unknown directive \"" + directive + "\""); } } protected void parsePrefixID() 
throws IOException, RDFParseException, RDFHandlerException { skipWSC(); // Read prefix ID (e.g. "rdf:" or ":") StringBuilder prefixID = new StringBuilder(8); while (true) { int c = readCodePoint(); if (c == ':') { unread(c); break; } else if (TurtleUtil.isWhitespace(c)) { break; } else if (c == -1) { throwEOFException(); } prefixID.append(Character.toChars(c)); } skipWSC(); verifyCharacterOrFail(readCodePoint(), ":"); skipWSC(); // Read the namespace URI URI namespace = parseURI(); // Store and report this namespace mapping String prefixStr = prefixID.toString(); String namespaceStr = namespace.toString(); setNamespace(prefixStr, namespaceStr); if (rdfHandler != null) { rdfHandler.handleNamespace(prefixStr, namespaceStr); } } protected void parseBase() throws IOException, RDFParseException, RDFHandlerException { skipWSC(); URI baseURI = parseURI(); setBaseURI(baseURI.toString()); } protected void parseTriples() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); // If the first character is an open bracket we need to decide which of // the two parsing methods for blank nodes to use if (c == '[') { c = readCodePoint(); skipWSC(); c = peekCodePoint(); if (c == ']') { c = readCodePoint(); subject = createBNode(); skipWSC(); parsePredicateObjectList(); } else { unread('['); subject = parseImplicitBlank(); } skipWSC(); c = peekCodePoint(); // if this is not the end of the statement, recurse into the list of // predicate and objects, using the subject parsed above as the subject // of the statement. if (c != '.') { parsePredicateObjectList(); } } else { parseSubject(); skipWSC(); parsePredicateObjectList(); } subject = null; predicate = null; object = null; } protected void parsePredicateObjectList() throws IOException, RDFParseException, RDFHandlerException { predicate = parsePredicate(); skipWSC(); parseObjectList(); while (skipWSC() == ';') { readCodePoint(); int c = skipWSC(); if (c == '.' 
|| // end of triple c == ']' || c == '}') // end of predicateObjectList inside blank // node { break; } else if (c == ';') { // empty predicateObjectList, skip to next continue; } predicate = parsePredicate(); skipWSC(); parseObjectList(); } } protected void parseObjectList() throws IOException, RDFParseException, RDFHandlerException { parseObject(); while (skipWSC() == ',') { readCodePoint(); skipWSC(); parseObject(); } } protected void parseSubject() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); if (c == '(') { subject = parseCollection(); } else if (c == '[') { subject = parseImplicitBlank(); } else { Value value = parseValue(); if (value instanceof Resource) { subject = (Resource)value; } else { reportFatalError("Illegal subject value: " + value); } } } protected URI parsePredicate() throws IOException, RDFParseException, RDFHandlerException { // Check if the short-cut 'a' is used int c1 = readCodePoint(); if (c1 == 'a') { int c2 = readCodePoint(); if (TurtleUtil.isWhitespace(c2)) { // Short-cut is used, return the rdf:type URI return RDF.TYPE; } // Short-cut is not used, unread all characters unread(c2); } unread(c1); // Predicate is a normal resource Value predicate = parseValue(); if (predicate instanceof URI) { return (URI)predicate; } else { reportFatalError("Illegal predicate value: " + predicate); return null; } } protected void parseObject() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); if (c == '(') { object = parseCollection(); } else if (c == '[') { object = parseImplicitBlank(); } else { object = parseValue(); } reportStatement(subject, predicate, object); } /** * Parses a collection, e.g. ( item1 item2 item3 ). 
*/ protected Resource parseCollection() throws IOException, RDFParseException, RDFHandlerException { verifyCharacterOrFail(readCodePoint(), "("); int c = skipWSC(); if (c == ')') { // Empty list readCodePoint(); return RDF.NIL; } else { BNode listRoot = createBNode(); // Remember current subject and predicate Resource oldSubject = subject; URI oldPredicate = predicate; // generated bNode becomes subject, predicate becomes rdf:first subject = listRoot; predicate = RDF.FIRST; parseObject(); BNode bNode = listRoot; while (skipWSC() != ')') { // Create another list node and link it to the previous BNode newNode = createBNode(); reportStatement(bNode, RDF.REST, newNode); // New node becomes the current subject = bNode = newNode; parseObject(); } // Skip ')' readCodePoint(); // Close the list reportStatement(bNode, RDF.REST, RDF.NIL); // Restore previous subject and predicate subject = oldSubject; predicate = oldPredicate; return listRoot; } } /** * Parses an implicit blank node. This method parses the token [] * and predicateObjectLists that are surrounded by square brackets. */ protected Resource parseImplicitBlank() throws IOException, RDFParseException, RDFHandlerException { verifyCharacterOrFail(readCodePoint(), "["); BNode bNode = createBNode(); int c = readCodePoint(); if (c != ']') { unread(c); // Remember current subject and predicate Resource oldSubject = subject; URI oldPredicate = predicate; // generated bNode becomes subject subject = bNode; // Enter recursion with nested predicate-object list skipWSC(); parsePredicateObjectList(); skipWSC(); // Read closing bracket verifyCharacterOrFail(readCodePoint(), "]"); // Restore previous subject and predicate subject = oldSubject; predicate = oldPredicate; } return bNode; } /** * Parses an RDF value. This method parses uriref, qname, node ID, quoted * literal, integer, double and boolean. 
*/ protected Value parseValue() throws IOException, RDFParseException, RDFHandlerException { int c = peekCodePoint(); if (c == '<') { // uriref, e.g. return parseURI(); } else if (c == ':' || TurtleUtil.isPrefixStartChar(c)) { // qname or boolean return parseQNameOrBoolean(); } else if (c == '_') { // node ID, e.g. _:n1 return parseNodeID(); } else if (c == '"' || c == '\'') { // quoted literal, e.g. "foo" or """foo""" or 'foo' or '''foo''' return parseQuotedLiteral(); } else if (ASCIIUtil.isNumber(c) || c == '.' || c == '+' || c == '-') { // integer or double, e.g. 123 or 1.2e3 return parseNumber(); } else if (c == -1) { throwEOFException(); return null; } else { reportFatalError("Expected an RDF value here, found '" + new String(Character.toChars(c)) + "'"); return null; } } /** * Parses a quoted string, optionally followed by a language tag or datatype. */ protected Literal parseQuotedLiteral() throws IOException, RDFParseException, RDFHandlerException { String label = parseQuotedString(); // Check for presence of a language tag or datatype int c = peekCodePoint(); if (c == '@') { readCodePoint(); // Read language StringBuilder lang = new StringBuilder(8); c = readCodePoint(); if (c == -1) { throwEOFException(); } boolean verifyLanguageTag = getParserConfig().get(BasicParserSettings.VERIFY_LANGUAGE_TAGS); if (verifyLanguageTag && !TurtleUtil.isLanguageStartChar(c)) { reportError("Expected a letter, found '" + new String(Character.toChars(c)) + "'", BasicParserSettings.VERIFY_LANGUAGE_TAGS); } lang.append(Character.toChars(c)); c = readCodePoint(); while (!TurtleUtil.isWhitespace(c)) { // SES-1887 : Flexibility introduced for SES-1985 and SES-1821 needs // to be counterbalanced against legitimate situations where Turtle // language tags do not need whitespace following the language tag if (c == '.' 
|| c == ';' || c == ',' || c == ')' || c == ']' || c == -1) { break; } if (verifyLanguageTag && !TurtleUtil.isLanguageChar(c)) { reportError("Illegal language tag char: '" + new String(Character.toChars(c)) + "'", BasicParserSettings.VERIFY_LANGUAGE_TAGS); } lang.append(Character.toChars(c)); c = readCodePoint(); } unread(c); return createLiteral(label, lang.toString(), null, getLineNumber(), -1); } else if (c == '^') { readCodePoint(); // next character should be another '^' verifyCharacterOrFail(readCodePoint(), "^"); skipWSC(); // Read datatype Value datatype = parseValue(); if (datatype instanceof URI) { return createLiteral(label, null, (URI)datatype, getLineNumber(), -1); } else { reportFatalError("Illegal datatype value: " + datatype); return null; } } else { return createLiteral(label, null, null, getLineNumber(), -1); } } /** * Parses a quoted string, which is either a "normal string" or a """long * string""". */ protected String parseQuotedString() throws IOException, RDFParseException { String result = null; int c1 = readCodePoint(); // First character should be '"' or "'" verifyCharacterOrFail(c1, "\"\'"); // Check for long-string, which starts and ends with three double quotes int c2 = readCodePoint(); int c3 = readCodePoint(); if ((c1 == '"' && c2 == '"' && c3 == '"') || (c1 == '\'' && c2 == '\'' && c3 == '\'')) { // Long string result = parseLongString(c2); } else { // Normal string unread(c3); unread(c2); result = parseString(c1); } // Unescape any escape sequences try { result = TurtleUtil.decodeString(result); } catch (IllegalArgumentException e) { reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES); } return result; } /** * Parses a "normal string". This method requires that the opening character * has already been parsed. 
*/ protected String parseString(int closingCharacter) throws IOException, RDFParseException { StringBuilder sb = new StringBuilder(32); while (true) { int c = readCodePoint(); if (c == closingCharacter) { break; } else if (c == -1) { throwEOFException(); } sb.append(Character.toChars(c)); if (c == '\\') { // This escapes the next character, which might be a '"' c = readCodePoint(); if (c == -1) { throwEOFException(); } sb.append(Character.toChars(c)); } } return sb.toString(); } /** * Parses a """long string""". This method requires that the first three * characters have already been parsed. */ protected String parseLongString(int closingCharacter) throws IOException, RDFParseException { StringBuilder sb = new StringBuilder(1024); int doubleQuoteCount = 0; int c; while (doubleQuoteCount < 3) { c = readCodePoint(); if (c == -1) { throwEOFException(); } else if (c == closingCharacter) { doubleQuoteCount++; } else { doubleQuoteCount = 0; } sb.append(Character.toChars(c)); if (c == '\\') { // This escapes the next character, which might be a '"' c = readCodePoint(); if (c == -1) { throwEOFException(); } sb.append(Character.toChars(c)); } } return sb.substring(0, sb.length() - 3); } protected Literal parseNumber() throws IOException, RDFParseException { StringBuilder value = new StringBuilder(8); URI datatype = XMLSchema.INTEGER; int c = readCodePoint(); // read optional sign character if (c == '+' || c == '-') { value.append(Character.toChars(c)); c = readCodePoint(); } while (ASCIIUtil.isNumber(c)) { value.append(Character.toChars(c)); c = readCodePoint(); } if (c == '.' 
|| c == 'e' || c == 'E') { // read optional fractional digits if (c == '.') { if (TurtleUtil.isWhitespace(peekCodePoint())) { // We're parsing an integer that did not have a space before the // period to end the statement } else { value.append(Character.toChars(c)); c = readCodePoint(); while (ASCIIUtil.isNumber(c)) { value.append(Character.toChars(c)); c = readCodePoint(); } if (value.length() == 1) { // We've only parsed a '.' reportFatalError("Object for statement missing"); } // We're parsing a decimal or a double datatype = XMLSchema.DECIMAL; } } else { if (value.length() == 0) { // We've only parsed an 'e' or 'E' reportFatalError("Object for statement missing"); } } // read optional exponent if (c == 'e' || c == 'E') { datatype = XMLSchema.DOUBLE; value.append(Character.toChars(c)); c = readCodePoint(); if (c == '+' || c == '-') { value.append(Character.toChars(c)); c = readCodePoint(); } if (!ASCIIUtil.isNumber(c)) { reportError("Exponent value missing", BasicParserSettings.VERIFY_DATATYPE_VALUES); } value.append(Character.toChars(c)); c = readCodePoint(); while (ASCIIUtil.isNumber(c)) { value.append(Character.toChars(c)); c = readCodePoint(); } } } // Unread last character, it isn't part of the number unread(c); // String label = value.toString(); // if (datatype.equals(XMLSchema.INTEGER)) { // try { // label = XMLDatatypeUtil.normalizeInteger(label); // } // catch (IllegalArgumentException e) { // // Note: this should never happen because of the parse constraints // reportError("Illegal integer value: " + label); // } // } // return createLiteral(label, null, datatype); // Return result as a typed literal return createLiteral(value.toString(), null, datatype, getLineNumber(), -1); } protected URI parseURI() throws IOException, RDFParseException { StringBuilder uriBuf = new StringBuilder(100); // First character should be '<' int c = readCodePoint(); verifyCharacterOrFail(c, "<"); // Read up to the next '>' character while (true) { c = readCodePoint(); if 
(c == '>') { break; } else if (c == -1) { throwEOFException(); } if (c == ' ') { reportFatalError("IRI included an unencoded space: '" + c + "'"); } uriBuf.append(Character.toChars(c)); if (c == '\\') { // This escapes the next character, which might be a '>' c = readCodePoint(); if (c == -1) { throwEOFException(); } if (c != 'u' && c != 'U') { reportFatalError("IRI includes string escapes: '\\" + c + "'"); } uriBuf.append(Character.toChars(c)); } } if (c == '.') { reportFatalError("IRI must not end in a '.'"); } String uri = uriBuf.toString(); // Unescape any escape sequences try { // FIXME: The following decodes \n and similar in URIs, which should be // invalid according to test uri = TurtleUtil.decodeString(uri); } catch (IllegalArgumentException e) { reportError(e.getMessage(), BasicParserSettings.VERIFY_DATATYPE_VALUES); } return super.resolveURI(uri); } /** * Parses qnames and boolean values, which have equivalent starting * characters. */ protected Value parseQNameOrBoolean() throws IOException, RDFParseException { // First character should be a ':' or a letter int c = readCodePoint(); if (c == -1) { throwEOFException(); } if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) { reportError("Expected a ':' or a letter, found '" + new String(Character.toChars(c)) + "'", BasicParserSettings.VERIFY_RELATIVE_URIS); } String namespace = null; if (c == ':') { // qname using default namespace namespace = getNamespace(""); } else { // c is the first letter of the prefix StringBuilder prefix = new StringBuilder(8); prefix.append(Character.toChars(c)); int previousChar = c; c = readCodePoint(); while (TurtleUtil.isPrefixChar(c)) { prefix.append(Character.toChars(c)); previousChar = c; c = readCodePoint(); } if (c != ':') { // prefix may actually be a boolean value String value = prefix.toString(); if (value.equals("true") || value.equals("false")) { unread(c); return createLiteral(value, null, XMLSchema.BOOLEAN, getLineNumber(), -1); } } else { if (previousChar == '.') { 
// '.' is a legal prefix name char, but can not appear at the end reportFatalError("prefix can not end with with '.'"); } } verifyCharacterOrFail(c, ":"); namespace = getNamespace(prefix.toString()); } // c == ':', read optional local name StringBuilder localName = new StringBuilder(16); c = readCodePoint(); if (TurtleUtil.isNameStartChar(c)) { if (c == '\\') { localName.append(readLocalEscapedChar()); } else { localName.append(Character.toChars(c)); } int previousChar = c; c = readCodePoint(); while (TurtleUtil.isNameChar(c)) { if (c == '\\') { localName.append(readLocalEscapedChar()); } else { localName.append(Character.toChars(c)); } previousChar = c; c = readCodePoint(); } // Unread last character unread(c); if (previousChar == '.') { // '.' is a legal name char, but can not appear at the end, so is // not actually part of the name unread(previousChar); localName.deleteCharAt(localName.length() - 1); } } else { // Unread last character unread(c); } String localNameString = localName.toString(); for (int i = 0; i < localNameString.length(); i++) { if (localNameString.charAt(i) == '%') { if (i > localNameString.length() - 3 || !ASCIIUtil.isHex(localNameString.charAt(i + 1)) || !ASCIIUtil.isHex(localNameString.charAt(i + 2))) { reportFatalError("Found incomplete percent-encoded sequence: " + localNameString); } } } // if (c == '.') { // reportFatalError("Blank node identifier must not end in a '.'"); // } // Note: namespace has already been resolved return createURI(namespace + localNameString); } private char readLocalEscapedChar() throws RDFParseException, IOException { int c = readCodePoint(); if (TurtleUtil.isLocalEscapedChar(c)) { return (char)c; } else { throw new RDFParseException("found '" + new String(Character.toChars(c)) + "', expected one of: " + Arrays.toString(TurtleUtil.LOCAL_ESCAPED_CHARS)); } } /** * Parses a blank node ID, e.g. _:node1. 
*/ protected BNode parseNodeID() throws IOException, RDFParseException { // Node ID should start with "_:" verifyCharacterOrFail(readCodePoint(), "_"); verifyCharacterOrFail(readCodePoint(), ":"); // Read the node ID int c = readCodePoint(); if (c == -1) { throwEOFException(); } else if (!TurtleUtil.isBLANK_NODE_LABEL_StartChar(c)) { reportError("Expected a letter, found '" + (char)c + "'", BasicParserSettings.PRESERVE_BNODE_IDS); } StringBuilder name = new StringBuilder(32); name.append(Character.toChars(c)); // Read all following letter and numbers, they are part of the name c = readCodePoint(); // If we would never go into the loop we must unread now if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) { unread(c); } while (TurtleUtil.isBLANK_NODE_LABEL_Char(c)) { int previous = c; c = readCodePoint(); if (previous == '.' && (c == -1 || TurtleUtil.isWhitespace(c) || c == '<' || c == '_')) { unread(c); unread(previous); break; } name.append((char)previous); if (!TurtleUtil.isBLANK_NODE_LABEL_Char(c)) { unread(c); } } return createBNode(name.toString()); } protected void reportStatement(Resource subj, URI pred, Value obj) throws RDFParseException, RDFHandlerException { Statement st = createStatement(subj, pred, obj); if (rdfHandler != null) { rdfHandler.handleStatement(st); } } /** * Verifies that the supplied character code point codePoint is one * of the expected characters specified in expected. This method * will throw a ParseException if this is not the case. 
*/ protected void verifyCharacterOrFail(int codePoint, String expected) throws RDFParseException { if (codePoint == -1) { throwEOFException(); } final String supplied = new String(Character.toChars(codePoint)); if (expected.indexOf(supplied) == -1) { StringBuilder msg = new StringBuilder(32); msg.append("Expected "); for (int i = 0; i < expected.length(); i++) { if (i > 0) { msg.append(" or "); } msg.append('\''); msg.append(expected.charAt(i)); msg.append('\''); } msg.append(", found '"); msg.append(supplied); msg.append("'"); reportFatalError(msg.toString()); } } /** * Consumes any white space characters (space, tab, line feed, newline) and * comments (#-style) from reader. After this method has been * called, the first character that is returned by reader is either * a non-ignorable character, or EOF. For convenience, this character is also * returned by this method. * * @return The next character code point that will be returned by * reader. */ protected int skipWSC() throws IOException, RDFHandlerException { int c = readCodePoint(); while (TurtleUtil.isWhitespace(c) || c == '#') { if (c == '#') { processComment(); } else if (c == '\n') { // we only count line feeds (LF), not carriage return (CR), as // normally a CR is immediately followed by a LF. lineNumber++; } c = readCodePoint(); } unread(c); return c; } /** * Consumes characters from reader until the first EOL has been read. This * line of text is then passed to the {@link #rdfHandler} as a comment. */ protected void processComment() throws IOException, RDFHandlerException { StringBuilder comment = new StringBuilder(64); int c = readCodePoint(); while (c != -1 && c != 0xD && c != 0xA) { comment.append(Character.toChars(c)); c = readCodePoint(); } // c is equal to -1, \r or \n. // In case c is equal to \r, we should also read a following \n. 
if (c == 0xD) { c = readCodePoint(); if (c != 0xA) { unread(c); } } if (rdfHandler != null) { rdfHandler.handleComment(comment.toString()); } reportLocation(); } /** * Reads the next Unicode code point. * * @return the next Unicode code point, or -1 if the end of the stream has * been reached. * @throws IOException */ protected int readCodePoint() throws IOException { int next = reader.read(); if (Character.isHighSurrogate((char)next)) { next = Character.toCodePoint((char)next, (char)reader.read()); } return next; } /** * Pushes back a single code point by copying it to the front of the buffer. * After this method returns, a call to {@link #readCodePoint()} will return * the same code point c again. * * @param codePoint * a single Unicode code point. * @throws IOException */ protected void unread(int codePoint) throws IOException { if (codePoint != -1) { if (Character.isSupplementaryCodePoint(codePoint)) { final char[] surrogatePair = Character.toChars(codePoint); reader.unread(surrogatePair); } else { reader.unread(codePoint); } } } /** * Pushes back the supplied string by copying it to the front of the buffer. * After this method returns, successive calls to {@link #readCodePoint()} * will return the code points in the supplied string again, starting at the * first in the String.. * * @param string * the string to un-read. * @throws IOException */ protected void unread(String string) throws IOException { for (int i = string.codePointCount(0, string.length()); i >= 1; i--) { final int codePoint = string.codePointBefore(i); if (Character.isSupplementaryCodePoint(codePoint)) { final char[] surrogatePair = Character.toChars(codePoint); reader.unread(surrogatePair); } else { reader.unread(codePoint); } } } /** * Peeks at the next Unicode code point without advancing the reader, and * returns its value. * * @return the next Unicode code point, or -1 if the end of the stream has * been reached. 
* @throws IOException */ protected int peekCodePoint() throws IOException { int result = readCodePoint(); unread(result); return result; } protected void reportLocation() { reportLocation(getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number * information to the error. */ @Override protected void reportWarning(String msg) { reportWarning(msg, getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportError(String, RioSetting)}, adding * line number information to the error. */ @Override protected void reportError(String msg, RioSetting setting) throws RDFParseException { reportError(msg, getLineNumber(), -1, setting); } /** * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line * number information to the error. */ @Override protected void reportFatalError(String msg) throws RDFParseException { reportFatalError(msg, getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line * number information to the error. */ @Override protected void reportFatalError(Exception e) throws RDFParseException { reportFatalError(e, getLineNumber(), -1); } protected void throwEOFException() throws RDFParseException { throw new RDFParseException("Unexpected end of file"); } private int getLineNumber() { return lineNumber; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy