/*
 * org.openrdf.rio.ntriples.NTriplesParser Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of sesame-rio-ntriples Show documentation
 * Rio parser and writer implementation for the N-Triples file format.
 * There is a newer version: 4.1.2
 */
/* 
 * Licensed to Aduna under one or more contributor license agreements.  
 * See the NOTICE.txt file distributed with this work for additional 
 * information regarding copyright ownership. 
 *
 * Aduna licenses this file to you under the terms of the Aduna BSD 
 * License (the "License"); you may not use this file except in compliance 
 * with the License. See the LICENSE.txt file distributed with this work 
 * for the full License.
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 * implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */
package org.openrdf.rio.ntriples;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;

import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RioSetting;
import org.openrdf.rio.helpers.BasicParserSettings;
import org.openrdf.rio.helpers.NTriplesParserSettings;
import org.openrdf.rio.helpers.RDFParserBase;

/**
 * RDF parser for N-Triples files. A specification of N-Triples can be found in
 * the "N-Triples" section of the W3C RDF Test Cases document
 * (http://www.w3.org/TR/rdf-testcases/#ntriples). This parser is not
 * thread-safe, therefore its parse methods are synchronized.
 * 
 * @author Arjohn Kampman
 */
public class NTriplesParser extends RDFParserBase {

	/*-----------*
	 * Variables *
	 *-----------*/

	/**
	 * Character source of the document being parsed; assigned at the start of
	 * {@link #parse(Reader, String)} and read by all low-level parse methods.
	 */
	protected Reader reader;

	/**
	 * Number of the line currently being read. Set to 1 when parsing starts
	 * and incremented by {@link #skipLine(int)} for every line end it consumes.
	 */
	protected int lineNo;

	/**
	 * Subject of the triple currently being parsed; set by
	 * {@link #parseSubject(int)} and reset to null once the triple has been
	 * processed.
	 */
	protected Resource subject;

	/**
	 * Predicate of the triple currently being parsed; set by
	 * {@link #parsePredicate(int)} and reset to null once the triple has been
	 * processed.
	 */
	protected URI predicate;

	/**
	 * Object of the triple currently being parsed; set by
	 * {@link #parseObject(int)} and reset to null once the triple has been
	 * processed.
	 */
	protected Value object;

	/*--------------*
	 * Constructors *
	 *--------------*/

	/**
	 * Creates a new NTriplesParser that will use a {@link ValueFactoryImpl} to
	 * create object for resources, bNodes and literals.
	 */
	public NTriplesParser() {
		super();
	}

	/**
	 * Creates a new NTriplesParser that will use the supplied
	 * {@link ValueFactory} to create RDF model objects (URIs, bNodes, literals
	 * and statements).
	 * 
	 * @param valueFactory
	 *        The ValueFactory that is handed to the {@link RDFParserBase}
	 *        constructor.
	 */
	public NTriplesParser(ValueFactory valueFactory) {
		super(valueFactory);
	}

	/*---------*
	 * Methods *
	 *---------*/

	/**
	 * Implements RDFParser.getRDFFormat().
	 * 
	 * @return Always {@link RDFFormat#NTRIPLES}, the format handled by this
	 *         parser.
	 */
	public RDFFormat getRDFFormat() {
		return RDFFormat.NTRIPLES;
	}

	/**
	 * Parses N-Triples data from a byte stream. The bytes are decoded as
	 * US-ASCII and the resulting character stream is handed to
	 * {@link #parse(Reader, String)}. Implements the parse(InputStream, String)
	 * method defined in the RDFParser interface.
	 * 
	 * @param in
	 *        The InputStream from which to read the data, must not be
	 *        null. The InputStream is supposed to contain 7-bit
	 *        US-ASCII characters, as per the N-Triples specification.
	 * @param baseURI
	 *        The URI associated with the data in the InputStream, must not be
	 *        null.
	 * @throws IOException
	 *         If an I/O error occurred while data was read from the InputStream.
	 * @throws RDFParseException
	 *         If the parser has found an unrecoverable parse error.
	 * @throws RDFHandlerException
	 *         If the configured statement handler encountered an unrecoverable
	 *         error.
	 * @throws IllegalArgumentException
	 *         If the supplied input stream or base URI is null.
	 */
	public synchronized void parse(InputStream in, String baseURI)
		throws IOException, RDFParseException, RDFHandlerException
	{
		if (in == null) {
			throw new IllegalArgumentException("Input stream can not be 'null'");
		}

		// The base URI is validated by parse(Reader, String), not here.
		try {
			final Reader asciiReader = new InputStreamReader(in, "US-ASCII");
			parse(asciiReader, baseURI);
		}
		catch (UnsupportedEncodingException e) {
			// US-ASCII is expected to be available on every platform, so this
			// is treated as a programming/environment error.
			throw new RuntimeException(e);
		}
	}

	/**
	 * Parses N-Triples data from a character stream, line by line. Comment
	 * lines (starting with '#') and empty lines are skipped; every other line
	 * is parsed as a triple and reported to the configured RDF handler.
	 * Implements the parse(Reader, String) method defined in the RDFParser
	 * interface.
	 * 
	 * @param reader
	 *        The Reader from which to read the data, must not be null.
	 * @param baseURI
	 *        The URI associated with the data in the Reader, must not be
	 *        null.
	 * @throws IOException
	 *         If an I/O error occurred while data was read from the Reader.
	 * @throws RDFParseException
	 *         If the parser has found an unrecoverable parse error.
	 * @throws RDFHandlerException
	 *         If the configured statement handler encountered an unrecoverable
	 *         error.
	 * @throws IllegalArgumentException
	 *         If the supplied reader or base URI is null.
	 */
	public synchronized void parse(Reader reader, String baseURI)
		throws IOException, RDFParseException, RDFHandlerException
	{
		if (reader == null) {
			throw new IllegalArgumentException("Reader can not be 'null'");
		}
		if (baseURI == null) {
			throw new IllegalArgumentException("base URI can not be 'null'");
		}

		rdfHandler.startRDF();

		this.reader = reader;
		lineNo = 1;

		reportLocation(lineNo, 1);

		try {
			// Each iteration starts on the first non-blank character of a line
			for (int c = skipWhitespace(reader.read()); c != -1; c = skipWhitespace(c)) {
				final boolean commentOrEmptyLine = c == '#' || c == '\r' || c == '\n';

				c = commentOrEmptyLine ? skipLine(c) : parseTriple(c);
			}
		}
		finally {
			// Release per-parse state whether or not parsing succeeded
			clear();
		}

		// Only reached when the whole document was processed without an
		// exception
		rdfHandler.endRDF();
	}

	/**
	 * Skips horizontal whitespace. Starting with the supplied character, keeps
	 * reading from the reader for as long as the current character is a space
	 * or a tab.
	 * 
	 * @param c
	 *        The character to start with.
	 * @return The first character that is neither a space nor a tab (which may
	 *         be the supplied character itself), or -1 if the end of the
	 *         character stream has been reached.
	 * @throws IOException
	 *         If reading from the reader fails.
	 */
	protected int skipWhitespace(int c)
		throws IOException
	{
		int current = c;

		for (; current == ' ' || current == '\t'; current = reader.read()) {
			// nothing to do, the character is simply discarded
		}

		return current;
	}

	/**
	 * Verifies that nothing but whitespace, optionally followed by a '#'
	 * comment, remains on the current line.
	 * <p>
	 * The supplied character is considered consumed by the caller (normally it
	 * is the '.' that terminates a triple) and is discarded: checking starts
	 * with the next character read from the reader.
	 * <p>
	 * A comment after the terminating '.' is accepted, as allowed by the
	 * N-Triples 1.1 specification. In that case the '#' is returned so that a
	 * subsequent call to {@link #skipLine(int)} discards the comment.
	 * 
	 * @param c
	 *        The character that was read last; its value is not inspected.
	 * @return The first character after the trailing whitespace: a line
	 *         terminator ('\r' or '\n'), a '#' that starts a trailing comment,
	 *         or -1 if the end of the character stream has been reached.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If anything other than whitespace or a comment follows.
	 */
	protected int assertLineTerminates(int c)
		throws IOException, RDFParseException
	{
		// Move past the character handed in by the caller
		c = reader.read();

		c = skipWhitespace(c);

		if (c == '#') {
			// Trailing comment: legal, leave it to the caller to skip the rest
			// of the line
			return c;
		}

		if (c != -1 && c != '\r' && c != '\n') {
			reportFatalError("Content after '.' is not allowed");
		}

		return c;
	}

	/**
	 * Discards the remainder of the current line, including its terminator.
	 * Recognised terminators are "\n", "\r" and "\r\n". When a terminator is
	 * consumed the line counter is incremented and the new location is
	 * reported.
	 * 
	 * @param c
	 *        The character that was read last.
	 * @return The first character after the line terminator, or -1 if the end
	 *         of the character stream has been reached.
	 * @throws IOException
	 *         If reading from the reader fails.
	 */
	protected int skipLine(int c)
		throws IOException
	{
		int current = c;

		// Advance to the line terminator (or the end of the stream)
		while (!(current == -1 || current == '\r' || current == '\n')) {
			current = reader.read();
		}

		if (current == -1) {
			// No terminator was seen, so the line number does not change
			return current;
		}

		final boolean carriageReturn = current == '\r';

		current = reader.read();

		if (carriageReturn && current == '\n') {
			// "\r\n" counts as a single line terminator
			current = reader.read();
		}

		lineNo++;

		reportLocation(lineNo, 1);

		return current;
	}

	/**
	 * Parses a single line of the form "subject predicate object ." and, when
	 * the line was parsed without an ignored error, reports the resulting
	 * statement to the RDF handler.
	 * <p>
	 * A parse error is swallowed only when the
	 * {@link NTriplesParserSettings#IGNORE_NTRIPLES_INVALID_LINES} setting is
	 * both enabled and registered as a non-fatal error; in that case the
	 * remainder of the line is skipped and no statement is reported. Note that
	 * in this situation the remainder of the line is skipped starting from the
	 * value that was last assigned to the local character variable before the
	 * exception was thrown.
	 * 
	 * @param c
	 *        The first character of the triple.
	 * @return The first character of the next line, or -1 if the end of the
	 *         character stream has been reached.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the line is invalid and the error is not to be ignored.
	 * @throws RDFHandlerException
	 *         If the RDF handler fails to process the statement.
	 */
	private int parseTriple(int c)
		throws IOException, RDFParseException, RDFHandlerException
	{
		boolean lineIsUsable = true;

		try {
			c = skipWhitespace(parseSubject(c));
			c = skipWhitespace(parsePredicate(c));
			c = skipWhitespace(parseObject(c));

			if (c == -1) {
				throwEOFException();
			}
			else if (c != '.') {
				reportError("Expected '.', found: " + (char)c,
						NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
			}

			c = assertLineTerminates(c);
		}
		catch (RDFParseException rdfpe) {
			final boolean tolerateInvalidLine = getParserConfig().get(
					NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES)
					&& getParserConfig().isNonFatalError(NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);

			if (!tolerateInvalidLine) {
				throw rdfpe;
			}

			lineIsUsable = false;
		}

		// Consume whatever is left of the line, including its terminator
		c = skipLine(c);

		if (lineIsUsable) {
			rdfHandler.handleStatement(createStatement(subject, predicate, object));
		}

		// Forget the values of this line before the next one is parsed
		subject = null;
		predicate = null;
		object = null;

		return c;
	}

	/**
	 * Parses the subject of a triple and stores it in {@link #subject}. The
	 * subject is either a URI reference (starting with '&lt;') or a blank node
	 * identifier (starting with '_').
	 * 
	 * @param c
	 *        The first character of the subject.
	 * @return The first character after the subject.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the end of the stream is reached or the subject does not
	 *         start with '&lt;' or '_'.
	 */
	protected int parseSubject(int c)
		throws IOException, RDFParseException
	{
		// Reuse the shared scratch buffer, as parseObject does, instead of
		// allocating a new builder for every triple. Its content is copied
		// into a String below, before the buffer is handed out again.
		StringBuilder sb = getBuffer();

		// subject is either an uriref (<foo://bar>) or a nodeID (_:node1)
		if (c == '<') {
			// subject is an uriref
			c = parseUriRef(c, sb);
			subject = createURI(sb.toString());
		}
		else if (c == '_') {
			// subject is a bNode
			c = parseNodeID(c, sb);
			subject = createBNode(sb.toString());
		}
		else if (c == -1) {
			throwEOFException();
		}
		else {
			reportFatalError("Expected '<' or '_', found: " + (char)c);
		}

		return c;
	}

	/**
	 * Parses the predicate of a triple and stores it in {@link #predicate}.
	 * The predicate must be a URI reference (starting with '&lt;').
	 * 
	 * @param c
	 *        The first character of the predicate.
	 * @return The first character after the predicate.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the end of the stream is reached or the predicate does not
	 *         start with '&lt;'.
	 */
	protected int parsePredicate(int c)
		throws IOException, RDFParseException
	{
		// Reuse the shared scratch buffer, as parseObject does, instead of
		// allocating a new builder for every triple. Its content is copied
		// into a String below, before the buffer is handed out again.
		StringBuilder sb = getBuffer();

		// predicate must be an uriref (<foo://bar>)
		if (c == '<') {
			// predicate is an uriref
			c = parseUriRef(c, sb);
			predicate = createURI(sb.toString());
		}
		else if (c == -1) {
			throwEOFException();
		}
		else {
			reportFatalError("Expected '<', found: " + (char)c);
		}

		return c;
	}

	/**
	 * Parses the object of a triple and stores it in {@link #object}. The
	 * object is a URI reference (starting with '&lt;'), a blank node identifier
	 * (starting with '_') or a literal (starting with a double quote),
	 * optionally carrying a language tag or a datatype.
	 * 
	 * @param c
	 *        The first character of the object.
	 * @return The first character after the object.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the end of the stream is reached or the object starts with an
	 *         unexpected character.
	 */
	protected int parseObject(int c)
		throws IOException, RDFParseException
	{
		final StringBuilder text = getBuffer();

		switch (c) {
			case '<':
				// URI reference
				c = parseUriRef(c, text);
				object = createURI(text.toString());
				break;

			case '_':
				// blank node
				c = parseNodeID(c, text);
				object = createBNode(text.toString());
				break;

			case '"': {
				// literal, possibly with a language tag or datatype
				final StringBuilder languageTag = getLanguageTagBuffer();
				final StringBuilder datatypeUri = getDatatypeUriBuffer();
				c = parseLiteral(c, text, languageTag, datatypeUri);
				object = createLiteral(text.toString(), languageTag.toString(), datatypeUri.toString());
				break;
			}

			case -1:
				throwEOFException();
				break;

			default:
				reportFatalError("Expected '<', '_' or '\"', found: " + (char)c);
				break;
		}

		return c;
	}

	/**
	 * Reads a URI reference of the form "&lt;...&gt;". Every character between
	 * the angle brackets is appended, unmodified, to the supplied builder.
	 * 
	 * @param c
	 *        The opening '&lt;' character.
	 * @param uriRef
	 *        Receives the characters between '&lt;' and '&gt;'.
	 * @return The first character after the closing '&gt;'.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the end of the stream is reached before the closing '&gt;'.
	 */
	protected int parseUriRef(int c, StringBuilder uriRef)
		throws IOException, RDFParseException
	{
		assert c == '<' : "Supplied char should be a '<', is: " + c;

		// Collect everything up to, but not including, the closing '>'
		int next;
		while ((next = reader.read()) != '>') {
			if (next == -1) {
				throwEOFException();
			}
			uriRef.append((char)next);
		}

		// Hand back the character that follows the '>'
		return reader.read();
	}

	/**
	 * Reads a blank node identifier of the form "_:name". The name, without
	 * the "_:" prefix, is appended to the supplied builder. The name is
	 * expected to start with a letter, followed by any number of letters and
	 * digits as determined by {@link NTriplesUtil}.
	 * 
	 * @param c
	 *        The leading '_' character.
	 * @param name
	 *        Receives the name of the blank node.
	 * @return The first character after the name, or -1 if the end of the
	 *         character stream has been reached right after the name.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the end of the stream is reached prematurely, or if a
	 *         reported syntax error is treated as fatal.
	 */
	protected int parseNodeID(int c, StringBuilder name)
		throws IOException, RDFParseException
	{
		assert c == '_' : "Supplied char should be a '_', is: " + c;

		// The underscore must be followed by a colon
		int ch = reader.read();
		if (ch == -1) {
			throwEOFException();
		}
		else if (ch != ':') {
			reportError("Expected ':', found: " + (char)ch, NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
		}

		// The first character of the name should be a letter
		ch = reader.read();
		if (ch == -1) {
			throwEOFException();
		}
		else if (!NTriplesUtil.isLetter(ch)) {
			reportError("Expected a letter, found: " + (char)ch,
					NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
		}
		name.append((char)ch);

		// All directly following letters and digits belong to the name too
		for (ch = reader.read(); ch != -1 && NTriplesUtil.isLetterOrNumber(ch); ch = reader.read()) {
			name.append((char)ch);
		}

		return ch;
	}

	/**
	 * Reads a literal: a double-quoted label that is optionally followed by
	 * either a language tag ("@tag") or a datatype ("^^" followed by a URI
	 * reference). The label is copied verbatim, escape sequences included;
	 * unescaping is left to the caller.
	 * 
	 * @param c
	 *        The opening double quote.
	 * @param value
	 *        Receives the raw, still escaped label.
	 * @param lang
	 *        Receives the language tag, if there is one.
	 * @param datatype
	 *        Receives the datatype URI, if there is one.
	 * @return The first character after the literal.
	 * @throws IOException
	 *         If reading from the reader fails.
	 * @throws RDFParseException
	 *         If the end of the stream is reached prematurely, or if a
	 *         reported syntax error is treated as fatal.
	 */
	private int parseLiteral(int c, StringBuilder value, StringBuilder lang, StringBuilder datatype)
		throws IOException, RDFParseException
	{
		assert c == '"' : "Supplied char should be a '\"', is: " + c;

		// Copy the label up to the closing, unescaped double quote
		for (c = reader.read(); c != '"'; c = reader.read()) {
			if (c == -1) {
				throwEOFException();
			}
			value.append((char)c);

			if (c == '\\') {
				// A backslash escapes the next character, which is copied
				// without being inspected so that an escaped double quote does
				// not end the label
				c = reader.read();
				if (c == -1) {
					throwEOFException();
				}
				value.append((char)c);
			}
		}

		// Step past the closing double quote
		c = reader.read();

		switch (c) {
			case '@':
				// Language tag: runs until '.', '^', a space, a tab or the end
				// of the stream
				for (c = reader.read(); !(c == -1 || c == '.' || c == '^' || c == ' ' || c == '\t'); c = reader.read()) {
					lang.append((char)c);
				}
				break;

			case '^':
				// Datatype: a second '^' followed by a URI reference
				c = reader.read();
				if (c == -1) {
					throwEOFException();
				}
				else if (c != '^') {
					reportError("Expected '^', found: " + (char)c,
							NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
				}

				c = reader.read();
				if (c == -1) {
					throwEOFException();
				}
				else if (c != '<') {
					reportError("Expected '<', found: " + (char)c,
							NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
				}

				c = parseUriRef(c, datatype);
				break;

			default:
				// Plain literal, nothing more to read
				break;
		}

		return c;
	}

	/**
	 * Creates a URI from its N-Triples representation. The escape sequences in
	 * the supplied string are resolved first. When unescaping fails the
	 * problem is reported and, unless the report raises an exception, the
	 * string is used as supplied.
	 * 
	 * @param uri
	 *        The URI as it appeared between the angle brackets.
	 * @return The URI created by the superclass.
	 * @throws RDFParseException
	 *         If an error is reported that is treated as fatal.
	 */
	@Override
	protected URI createURI(String uri)
		throws RDFParseException
	{
		String unescaped = uri;

		try {
			unescaped = NTriplesUtil.unescapeString(uri);
		}
		catch (IllegalArgumentException e) {
			reportError(e.getMessage(), NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
		}

		return super.createURI(unescaped);
	}

	/**
	 * Creates a literal from the raw strings found in the document. The escape
	 * sequences in the label are resolved first; when that fails the problem
	 * is reported and, unless the report raises an exception, the label is
	 * used as supplied. An empty language tag or datatype is treated as
	 * absent.
	 * 
	 * @param label
	 *        The still escaped label of the literal.
	 * @param lang
	 *        The language tag, or an empty string if there is none; must not
	 *        be null.
	 * @param datatype
	 *        The still escaped datatype URI, or an empty string if there is
	 *        none; must not be null.
	 * @return The literal created by the superclass.
	 * @throws RDFParseException
	 *         If an error is reported that is treated as fatal.
	 */
	protected Literal createLiteral(String label, String lang, String datatype)
		throws RDFParseException
	{
		String unescapedLabel = label;

		try {
			unescapedLabel = NTriplesUtil.unescapeString(label);
		}
		catch (IllegalArgumentException e) {
			reportError(e.getMessage(), NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);
		}

		// Empty strings stand for "not specified"
		final String languageTag = lang.length() == 0 ? null : lang;
		final URI datatypeUri = datatype.length() == 0 ? null : createURI(datatype);

		return super.createLiteral(unescapedLabel, languageTag, datatypeUri);
	}

	/**
	 * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number
	 * information to the warning. The current value of {@link #lineNo} is
	 * passed on as line number and -1 as column number (presumably meaning
	 * "column unknown"; confirm against RDFParserBase).
	 * 
	 * @param msg
	 *        The warning message.
	 */
	@Override
	protected void reportWarning(String msg) {
		reportWarning(msg, lineNo, -1);
	}

	/**
	 * Overrides {@link RDFParserBase#reportError(String, RioSetting)}, adding
	 * line number information to the error. The current value of
	 * {@link #lineNo} is passed on as line number and -1 as column number.
	 * 
	 * @param msg
	 *        The error message.
	 * @param setting
	 *        The boolean parser setting that governs how this error is
	 *        handled.
	 * @throws RDFParseException
	 *         If the superclass decides that the error is to be raised as an
	 *         exception.
	 */
	@Override
	protected void reportError(String msg, RioSetting<Boolean> setting)
		throws RDFParseException
	{
		reportError(msg, lineNo, -1, setting);
	}

	/**
	 * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line
	 * number information to the error. The current value of {@link #lineNo} is
	 * passed on as line number and -1 as column number.
	 * 
	 * @param msg
	 *        The error message.
	 * @throws RDFParseException
	 *         As declared by the superclass method that this call delegates
	 *         to.
	 */
	@Override
	protected void reportFatalError(String msg)
		throws RDFParseException
	{
		reportFatalError(msg, lineNo, -1);
	}

	/**
	 * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line
	 * number information to the error. The current value of {@link #lineNo} is
	 * passed on as line number and -1 as column number.
	 * 
	 * @param e
	 *        The exception that describes the error.
	 * @throws RDFParseException
	 *         As declared by the superclass method that this call delegates
	 *         to.
	 */
	@Override
	protected void reportFatalError(Exception e)
		throws RDFParseException
	{
		reportFatalError(e, lineNo, -1);
	}

	/**
	 * Signals that the end of the character stream was reached in the middle
	 * of a triple. Unlike the report methods of this class, the exception is
	 * created from the message alone; no line number is passed to it.
	 * 
	 * @throws RDFParseException
	 *         Always, with the message "Unexpected end of file".
	 */
	protected void throwEOFException()
		throws RDFParseException
	{
		throw new RDFParseException("Unexpected end of file");
	}

	/**
	 * Returns the shared scratch buffer, reset to zero length. The same buffer
	 * is handed out on every call, which avoids allocating a new builder for
	 * each thing that is parsed and thereby reduces heap churn. Callers have
	 * to watch out for side-effects and must convert the buffer to a
	 * {@link String} before the buffer is requested again.
	 * 
	 * @return The shared buffer, emptied.
	 */
	private StringBuilder getBuffer() {
		buffer.setLength(0);
		return buffer;
	}

	/**
	 * Scratch buffer handed out by {@link #getBuffer()}; created with an
	 * initial capacity of 100 characters.
	 */
	private final StringBuilder buffer = new StringBuilder(100);

	/**
	 * Returns the shared buffer used for parsing literal language tags, reset
	 * to zero length. The same buffer is handed out on every call, which
	 * avoids allocating a new builder for each tag and thereby reduces heap
	 * churn. Callers have to watch out for side-effects and must convert the
	 * buffer to a {@link String} before the buffer is requested again.
	 * 
	 * @return The shared language tag buffer, emptied.
	 */
	private StringBuilder getLanguageTagBuffer() {
		languageTagBuffer.setLength(0);
		return languageTagBuffer;
	}

	/**
	 * Buffer handed out by {@link #getLanguageTagBuffer()}; created with an
	 * initial capacity of 8 characters.
	 */
	private final StringBuilder languageTagBuffer = new StringBuilder(8);

	/**
	 * Returns the shared buffer used for parsing literal datatype URIs, reset
	 * to zero length. The same buffer is handed out on every call, which
	 * avoids allocating a new builder for each datatype and thereby reduces
	 * heap churn. Callers have to watch out for side-effects and must convert
	 * the buffer to a {@link String} before the buffer is requested again.
	 * 
	 * @return The shared datatype URI buffer, emptied.
	 */
	private StringBuilder getDatatypeUriBuffer() {
		datatypeUriBuffer.setLength(0);
		return datatypeUriBuffer;
	}

	/**
	 * Buffer handed out by {@link #getDatatypeUriBuffer()}; created with an
	 * initial capacity of 40 characters.
	 */
	private final StringBuilder datatypeUriBuffer = new StringBuilder(40);

	/**
	 * Clears the state of the superclass and then empties the three reusable
	 * buffers, releasing whatever storage they have grown to so that a large
	 * value parsed earlier does not keep memory occupied.
	 */
	@Override
	protected void clear() {
		super.clear();

		final StringBuilder[] scratchBuffers = { buffer, languageTagBuffer, datatypeUriBuffer };

		for (StringBuilder scratch : scratchBuffers) {
			// Empty the buffer first, then shrink its backing storage
			scratch.setLength(0);
			scratch.trimToSize();
		}
	}

	/*
	 * N-Triples parser supports these settings.
	 */
	@Override
	public Collection> getSupportedSettings() {
		Collection> result = new HashSet>(super.getSupportedSettings());

		// Very few parsers support stop at first error, so it is not enabled in
		// RDFParserBase
		result.add(NTriplesParserSettings.IGNORE_NTRIPLES_INVALID_LINES);

		return result;
	}

}