// Source listing: com.bigdata.rdf.rio.ntriples.BigdataNTriplesParser
/*
 * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2007.
 *
 * Licensed under the Aduna BSD-style license.
 */
package com.bigdata.rdf.rio.ntriples;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.io.input.BOMInputStream;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RioSetting;
import org.openrdf.rio.helpers.NTriplesParserSettings;
import org.openrdf.rio.helpers.RDFParserBase;
import org.openrdf.rio.ntriples.NTriplesUtil;

import com.bigdata.rdf.model.BigdataBNode;
import com.bigdata.rdf.model.BigdataStatement;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;

/**
 * RDF parser for N-Triples files. A specification of NTriples can be found in
 * <a href="http://www.w3.org/TR/rdf-testcases/#ntriples">this section</a> of
 * the RDF Test Cases document. This parser keeps its parse state in instance
 * fields and is therefore not thread-safe; its <code>parse</code> methods are
 * synchronized.
 * <p>
 * This parser has been modified to support the inline notation for statements
 * about statements.
 * <p>
 * This parser has been modified to reuse the same {@link StringBuilder} in
 * order to minimize heap churn.
 * <p>
 * This parser has been modified to permit "-" and "_" in blank node IDs (they
 * are not allowed in that position for NTRIPLES). This was done to support a
 * demonstration use case. That change could (and should) be backed out. It is
 * documented by FIXMEs in the code. One of the test files would also have to be
 * fixed.
 * 
 * @author Arjohn Kampman
 * @author Bryan Thompson
 * @openrdf
 */
public class BigdataNTriplesParser extends RDFParserBase {

	/*-----------*
	 * Variables *
	 *-----------*/

	/**
	 * Source of characters for the parse in progress. Assigned at the start of
	 * {@link #parse(Reader, String)} with a one character pushback buffer,
	 * which is what allows '&lt;&lt;' to be told apart from '&lt;'.
	 */
	private PushbackReader reader;

	/**
	 * Line number attached to reported errors and warnings. Set to 1 when a
	 * parse starts and incremented by {@link #skipLine(int)} for each EOL.
	 */
	private int lineNo;

	/**
	 * The value factory most recently handed to
	 * {@link #setValueFactory(ValueFactory)}.
	 */
	private ValueFactory valueFactory;
	
	/**
	 * LRU collection of embedded statements and their associated blank nodes,
	 * bounded to 100 entries.
	 * <p>
	 * The map is created in access order so that the entry discarded by
	 * <code>removeEldestEntry</code> is the least recently used one. (An
	 * insertion-ordered map would evict the oldest insertion instead, which is
	 * FIFO rather than LRU.) The active parsing code path does not currently
	 * consult this map.
	 */
	private final Map<BigdataStatement, BigdataBNode> sids = new LinkedHashMap<BigdataStatement, BigdataBNode>(
			16/* initialCapacity */, 0.75f/* loadFactor */, true/* accessOrder */) {

		private static final long serialVersionUID = 1L;

		/**
		 * Evict the eldest entry once the map holds more than 100 entries.
		 */
		@Override
		protected boolean removeEldestEntry(
				final Map.Entry<BigdataStatement, BigdataBNode> eldest) {

			return size() > 100;

		}
	};
	
	/**
	 * The terms collected for one statement while it is being parsed. One
	 * instance is pushed for the top-level statement and another for every
	 * embedded statement that is opened inside it.
	 */
	private static class State {

		/** Subject of the statement, assigned by <code>parseSubject</code>. */
		private Resource subject;

		/** Predicate of the statement, assigned by <code>parsePredicate</code>. */
		private URI predicate;

		/** Object of the statement, assigned by <code>parseObject</code>. */
		private Value object;

		/**
		 * The SID corresponding to the most recently parsed embedded statement.
		 */
		private BigdataBNode lastSID;

	}
	
	/**
	 * Stack of parse states. The bottom entry belongs to the top-level
	 * statement; one more entry is pushed for each embedded statement that is
	 * currently open.
	 */
	private final Stack<State> stack = new Stack<State>();

	/**
	 * Make the given state the current parse state.
	 * 
	 * @param state
	 *        The state to place on top of the stack.
	 */
	private void push(final State state) {
		stack.push(state);
	}

	/**
	 * Remove and return the current parse state.
	 * 
	 * @return The state that was on top of the stack.
	 */
	private State pop() {
		return stack.pop();
	}

	/**
	 * Return the current parse state without removing it.
	 * 
	 * @return The state on top of the stack.
	 */
	private State peek() {
		return stack.peek();
	}
	
//	/**
//	 * Return a buffer of zero length and non-zero capacity. The same buffer is
//	 * reused for each thing which is parsed. This reduces the heap churn
//	 * substantially. However, you have to watch out for side-effects and
//	 * convert the buffer to a {@link String} before the buffer is reused.
//	 * 
//	 * @param capacityIsIgnored
//
//	 * @return
//	 */
//	private StringBuilder getBuffer() {
//		buffer.setLength(0);
//		return buffer;
//	}
//
//	private final StringBuilder buffer = new StringBuilder(100);
//
//	private StringBuilder getLanguageTagBuffer() {
//		languageTagBuffer.setLength(0);
//		return languageTagBuffer;
//	}
//
//	private final StringBuilder languageTagBuffer = new StringBuilder(8);
//
//	private StringBuilder getDatatypeUriBuffer() {
//		datatypeUriBuffer.setLength(0);
//		return datatypeUriBuffer;
//	}
//
//	private final StringBuilder datatypeUriBuffer = new StringBuilder(40);
	
	/*--------------*
	 * Constructors *
	 *--------------*/

	/**
	 * Creates a new parser that uses the value factory returned by
	 * <code>BigdataValueFactoryImpl.getInstance("")</code> to create objects
	 * for resources, bNodes and literals.
	 */
	public BigdataNTriplesParser() {
		// We are providing Bigdata-specific value factory to support parsing of RDR,
		// which require BigdataValueFactory instead of default Sesame implementation
		// See https://jira.blazegraph.com/browse/BLZG-1322
		super(BigdataValueFactoryImpl.getInstance(""));
	}

	/**
	 * Creates a new parser that will use the supplied
	 * {@link BigdataValueFactory} to create RDF model objects.
	 * 
	 * @param valueFactory
	 *        The value factory, passed on to the superclass constructor.
	 */
	public BigdataNTriplesParser(BigdataValueFactory valueFactory) {
		super(valueFactory);
	}

	/**
	 * Sets the value factory on the superclass and also remembers it locally so
	 * that {@link #getValueFactory()} can hand it back.
	 * 
	 * @param valueFactory
	 *        The value factory to use. It has to be a
	 *        {@link BigdataValueFactory} for {@link #getValueFactory()} and
	 *        for the parsing of embedded statements to succeed.
	 */
	@Override
	public void setValueFactory(final ValueFactory valueFactory) {
		super.setValueFactory(valueFactory);
		this.valueFactory = valueFactory;
	}

	/**
	 * Return the value factory last passed to
	 * {@link #setValueFactory(ValueFactory)}, cast to
	 * {@link BigdataValueFactory}.
	 * 
	 * @return The value factory; <code>null</code> if none has been set.
	 * @throws ClassCastException
	 *             if you have not set a {@link BigdataValueFactory}.
	 */
	protected BigdataValueFactory getValueFactory() {
		return (BigdataValueFactory) valueFactory;
	}

	/*---------*
	 * Methods *
	 *---------*/

	/**
	 * Implements <code>RDFParser.getRDFFormat()</code>.
	 * 
	 * @return Always {@link RDFFormat#NTRIPLES}.
	 */
	@Override
	public final RDFFormat getRDFFormat() {
		return RDFFormat.NTRIPLES;
	}

    /**
     * Parses N-Triples data from a byte stream. The stream is wrapped in a
     * {@link BOMInputStream}, decoded as US-ASCII and handed to
     * {@link #parse(Reader, String)}, which does the actual work.
     * 
     * @param in
     *        The InputStream from which to read the data, must not be
     *        null. The InputStream is supposed to contain 7-bit
     *        US-ASCII characters, as per the N-Triples specification.
     * @param baseURI
     *        The URI associated with the data in the InputStream, must not be
     *        null.
     * @throws IOException
     *         If an I/O error occurred while data was read from the InputStream.
     * @throws RDFParseException
     *         If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *         If the configured statement handler encountered an unrecoverable
     *         error.
     * @throws IllegalArgumentException
     *         If the supplied input stream or base URI is null.
     */
    @Override
    public synchronized void parse(InputStream in, String baseURI)
        throws IOException, RDFParseException, RDFHandlerException
    {
        if (in == null) {
            throw new IllegalArgumentException("Input stream can not be 'null'");
        }

        // The base URI is validated by parse(Reader, String).
        try {
            final InputStream bomAware = new BOMInputStream(in, false);
            final Reader asciiReader = new InputStreamReader(bomAware, "US-ASCII");
            parse(asciiReader, baseURI);
        }
        catch (UnsupportedEncodingException e) {
            // US-ASCII is expected to be available on every platform.
            throw new RuntimeException(e);
        }
    }

    /**
     * Implementation of the parse(Reader, String) method defined in
     * the RDFParser interface. Notifies the handler that RDF data starts,
     * parses one statement per non-empty, non-comment line and then notifies
     * the handler that the data has ended.
     * <p>
     * The stack of parse states is emptied when this method exits, whether
     * normally or with an exception, so a failed parse cannot leave stale
     * states behind for the next call.
     * 
     * @param reader
     *        The Reader from which to read the data, must not be null.
     * @param baseURI
     *        The URI associated with the data in the Reader, must not be
     *        null.
     * @throws IOException
     *         If an I/O error occurred while data was read from the InputStream.
     * @throws RDFParseException
     *         If the parser has found an unrecoverable parse error.
     * @throws RDFHandlerException
     *         If the configured statement handler encountered an unrecoverable
     *         error.
     * @throws IllegalArgumentException
     *         If the supplied reader or base URI is null.
     */
    @Override
    public synchronized void parse(final Reader reader, final String baseURI)
        throws IOException, RDFParseException, RDFHandlerException
    {
        if (reader == null) {
            throw new IllegalArgumentException("Reader can not be 'null'");
        }
        if (baseURI == null) {
            throw new IllegalArgumentException("base URI can not be 'null'");
        }

        rdfHandler.startRDF();

        // We need pushback for '<<' versus '<'.
        this.reader = new PushbackReader(reader, 1/* size */);
        lineNo = 1;

        reportLocation(lineNo, 1);

        try {
            // State for the top-level statements.
            push(new State());

            // Always read through the pushback reader. The parameter shadows
            // the field, so the field has to be named explicitly here.
            int c = this.reader.read();
            c = skipWhitespace(c);

            while (c != -1) {
                if (c == '#') {
                    // Comment, ignore
                    c = skipLine(c);
                }
                else if (c == '\r' || c == '\n') {
                    // Empty line, ignore
                    c = skipLine(c);
                }
                else {
                    c = parseTriple(c, false/* embedded */);
                }

                c = skipWhitespace(c);
            }
        }
        finally {
            // Drop the top-level state, and any embedded states left over when
            // parsing was aborted by an exception.
            stack.clear();
            clear();
        }

        rdfHandler.endRDF();
    }

    /**
     * Skips spaces and tabs. Starting with the supplied character, characters
     * are read from the reader until one is found that is neither a space nor
     * a tab.
     * 
     * @param c
     *        The character most recently read.
     * @return The first character that is not a space or tab, or -1 if the end
     *         of the character stream has been reached.
     */
    protected int skipWhitespace(int c)
        throws IOException
    {
        int current = c;

        while (true) {
            if (current != ' ' && current != '\t') {
                return current;
            }
            current = reader.read();
        }
    }

    /**
     * Verifies that there is only whitespace until the end of the line. The
     * check starts at the next character of the reader.
     * 
     * @param c
     *        Not inspected; the value is replaced by the next character read.
     * @return The EOL character that was found, or -1 at the end of the
     *         character stream.
     * @throws RDFParseException
     *         If something other than spaces and tabs precedes the end of the
     *         line.
     */
    protected int assertLineTerminates(int c)
        throws IOException, RDFParseException
    {
        final int next = skipWhitespace(reader.read());

        final boolean lineEnded = next == -1 || next == '\r' || next == '\n';
        if (!lineEnded) {
            reportFatalError("Content after '.' is not allowed");
        }

        return next;
    }

    /**
     * Discards the rest of the current line, including its EOL. An EOL is
     * "\n", "\r" or "\r\n". When an EOL is consumed the line number is
     * incremented and the new location is reported.
     * 
     * @param c
     *        The character most recently read.
     * @return The first character after the EOL, or -1 if the end of the
     *         character stream has been reached.
     */
    protected int skipLine(int c)
        throws IOException
    {
        for (; c != -1 && c != '\r' && c != '\n'; c = reader.read()) {
            // discard everything up to the EOL
        }

        if (c == -1) {
            // End of stream: no EOL was consumed, the line count is unchanged.
            return c;
        }

        // c is either \r or \n here.
        final boolean carriageReturn = (c == '\r');

        c = reader.read();

        if (carriageReturn && c == '\n') {
            // "\r\n" is a single EOL.
            c = reader.read();
        }

        lineNo++;

        reportLocation(lineNo, 1);

        return c;
    }

	/**
	 * Parses one statement into the current parse state and passes it to the
	 * RDF handler.
	 * <p>
	 * A top-level statement has to be terminated by '.'; the rest of its line
	 * is skipped. An embedded statement has to be terminated by '&gt;&gt;'. In
	 * that case a blank node standing for the statement is created and stored
	 * in <code>lastSID</code> of the current state, where the caller picks it
	 * up.
	 * 
	 * @param c
	 *        The first character of the subject.
	 * @param embedded
	 *        <code>true</code> iff the statement is enclosed in
	 *        '&lt;&lt;' ... '&gt;&gt;'.
	 * @return The first character after the statement: for a top-level
	 *         statement the first character of the next line, for an embedded
	 *         statement the first non-whitespace character after '&gt;&gt;';
	 *         -1 at the end of the character stream.
	 * @throws RDFParseException
	 *         If the statement is malformed or the input ends before the
	 *         statement is complete.
	 */
	private int parseTriple(int c, final boolean embedded)
		throws IOException, RDFParseException, RDFHandlerException
	{
		c = parseSubject(c);

		c = skipWhitespace(c);

		c = parsePredicate(c);

		c = skipWhitespace(c);

		c = parseObject(c);

		c = skipWhitespace(c);

		if (c == -1) {
			throwEOFException();
		}
		else if (embedded) {
			// Embedded: expect the closing '>>'.
			if (c != '>') {
				reportFatalError("Expected '>', found: " + (char) c);
			}
			c = reader.read();
			if (c == -1) {
				// Input ended between the two '>'. Report that as EOF rather
				// than as an unexpected character (char) -1.
				throwEOFException();
			}
			if (c != '>') {
				reportFatalError("Expected '>', found: " + (char) c);
			}
			// eat the >> and then skip to the next non-whitespace character
			// on the same line.
			c = skipWhitespace(reader.read());
		}
		else {
			// Non-embedded: expect the terminating '.'.
			if (c != '.') {
				reportFatalError("Expected '.', found: " + (char) c);
			}
			c = skipLine(c);
		}

		final State state = peek();
		if (embedded) {
			// Create statement.
			final BigdataStatement st = (BigdataStatement) createStatement(
					state.subject, state.predicate, state.object);

			// add the RDR statement inside the << >>.
			rdfHandler.handleStatement(st);

			// Blank node that stands for the embedded statement.
			//
			// Note: an earlier approach resolved the statement against an LRU
			// map of SIDs and put the SID blank node in the context position
			// of the statement. That approach is disabled.
			state.lastSID = getValueFactory().createBNode(st);
		}
		else {
			// simple statement (original code path).
			final Statement st = createStatement(
					state.subject, state.predicate, state.object);
			rdfHandler.handleStatement(st);
		}

		return c;
	}

	/**
	 * Return <code>true</code> if the next character is <code>&lt;</code>. This
	 * should only be invoked when the current character is known to be
	 * <code>&lt;</code>. It provides one character lookahead to differentiate
	 * between a URI and a Statement: the character is read and then pushed
	 * back. For example, an embedded Statement in the subject position of
	 * another statement looks like this (the URIs are placeholders):
	 * 
	 * <pre>
	 * &lt;&lt;&lt;http://example.org/s&gt; &lt;http://example.org/p&gt; "no"@en&gt;&gt; &lt;http://example.org/q&gt; "288" .
	 * </pre>
	 * 
	 * @param c
	 *        The current character, expected to be '&lt;'.
	 * @throws RDFParseException
	 *         If the end of the character stream is reached.
	 */
	private boolean isStatement(int c) throws RDFParseException, IOException {
		assert c == '<' : "Supplied char should be a '<', is: " + c;

		final int lookahead = reader.read();
		if (lookahead == -1) {
			throwEOFException();
		}

		// Put the character back so that the caller reads it again.
		reader.unread(lookahead);

		return lookahead == '<';
	}
	
	/**
	 * Parses the subject of a statement and stores it in the current parse
	 * state. The subject is an uriref, a nodeID (_:node1) or an embedded
	 * Statement enclosed in '&lt;&lt;' ... '&gt;&gt;'.
	 * 
	 * @param c
	 *        The first character of the subject.
	 * @return The first character after the subject.
	 */
	private int parseSubject(int c)
		throws IOException, RDFParseException, RDFHandlerException
	{
		final State current = peek();

		switch (c) {
		case '<':
			if (!isStatement(c)) {
				// A single '<': the subject is an uriref.
				final StringBuilder uri = getBuffer();
				c = parseUriRef(c, uri);
				current.subject = createURI(uri.toString());
				break;
			}
			// '<<': the subject is an embedded statement.
			c = reader.read(); // the second '<'
			if (c != '<') {
				reportFatalError("Expected '<', found: " + (char) c);
			}
			// Skip any whitespace between '<<' and the embedded subject.
			c = skipWhitespace(reader.read());
			// The embedded statement is parsed into a state of its own; its
			// SID becomes the subject of the enclosing statement.
			push(new State());
			c = parseTriple(c, true/* embedded */);
			current.subject = pop().lastSID;
			break;

		case '_':
			// The subject is a bNode.
			final StringBuilder nodeId = getBuffer();
			c = parseNodeID(c, nodeId);
			current.subject = createBNode(nodeId.toString());
			break;

		case -1:
			throwEOFException();
			break;

		default:
			reportFatalError("Expected '<' or '_', found: " + (char)c);
			break;
		}

		return c;
	}

	/**
	 * Parses the predicate of a statement, which must be an uriref, and stores
	 * it in the current parse state.
	 * 
	 * @param c
	 *        The first character of the predicate.
	 * @return The first character after the predicate.
	 */
	private int parsePredicate(int c)
		throws IOException, RDFParseException
	{
		if (c == -1) {
			throwEOFException();
		}
		else if (c != '<') {
			reportFatalError("Expected '<', found: " + (char)c);
		}
		else {
			// The predicate is an uriref.
			final StringBuilder uri = getBuffer();
			c = parseUriRef(c, uri);
			peek().predicate = createURI(uri.toString());
		}

		return c;
	}

	/**
	 * Parses the object of a statement and stores it in the current parse
	 * state. The object is an uriref, a nodeID (_:node1), a literal (plain,
	 * language tagged or datatyped) or an embedded Statement enclosed in
	 * '&lt;&lt;' ... '&gt;&gt;'.
	 * 
	 * @param c
	 *        The first character of the object.
	 * @return The first character after the object.
	 */
	private int parseObject(int c)
		throws IOException, RDFParseException, RDFHandlerException
	{
		final State current = peek();

		switch (c) {
		case '<':
			if (!isStatement(c)) {
				// A single '<': the object is an uriref.
				final StringBuilder uri = getBuffer();
				c = parseUriRef(c, uri);
				current.object = createURI(uri.toString());
				break;
			}
			// '<<': the object is an embedded statement.
			c = reader.read(); // the second '<'
			if (c != '<') {
				reportFatalError("Expected '<', found: " + (char) c);
			}
			// Skip any whitespace between '<<' and the embedded subject.
			c = skipWhitespace(reader.read());
			// The embedded statement is parsed into a state of its own; its
			// SID becomes the object of the enclosing statement.
			push(new State());
			c = parseTriple(c, true/* embedded */);
			current.object = pop().lastSID;
			break;

		case '_':
			// The object is a bNode.
			final StringBuilder nodeId = getBuffer();
			c = parseNodeID(c, nodeId);
			current.object = createBNode(nodeId.toString());
			break;

		case '"':
			// The object is a literal. Label, language tag and datatype URI
			// are each collected in a buffer of their own.
			final StringBuilder label = getBuffer();
			final StringBuilder languageTag = getLanguageTagBuffer();
			final StringBuilder datatypeUri = getDatatypeUriBuffer();
			c = parseLiteral(c, label, languageTag, datatypeUri);
			current.object = createLiteral(label.toString(),
					languageTag.toString(), datatypeUri.toString());
			break;

		case -1:
			throwEOFException();
			break;

		default:
			reportFatalError("Expected '<', '_' or '\"', found: " + (char)c);
			break;
		}

		return c;
	}

	/**
	 * Collects the characters between '&lt;' and the next '&gt;'. No
	 * unescaping or validation is done here.
	 * 
	 * @param c
	 *        The current character, expected to be '&lt;'.
	 * @param uriRef
	 *        The buffer to which the characters of the URI are appended.
	 * @return The first character after the closing '&gt;'.
	 * @throws RDFParseException
	 *         If the end of the character stream is reached before '&gt;'.
	 */
	private int parseUriRef(int c, StringBuilder uriRef)
		throws IOException, RDFParseException
	{
		assert c == '<' : "Supplied char should be a '<', is: " + c;

		for (c = reader.read(); c != '>'; c = reader.read()) {
			if (c == -1) {
				throwEOFException();
			}
			uriRef.append((char)c);
		}

		// c == '>', hand back the character that follows it
		return reader.read();
	}

	/**
	 * Parses a blank node identifier of the form <code>_:name</code> and
	 * appends the name to the supplied buffer.
	 * <p>
	 * The first character of the name is accepted as it is, which is what
	 * permits fully numeric bnode ids. NOTE(review): this also lets through
	 * characters that {@link #isLetterOrNumber(int)} rejects - confirm that
	 * this is intended.
	 * 
	 * @param c
	 *        The current character, expected to be '_'.
	 * @param name
	 *        The buffer to which the characters of the name are appended.
	 * @return The first character after the name, or -1 at the end of the
	 *         character stream.
	 */
	private int parseNodeID(int c, StringBuilder name)
		throws IOException, RDFParseException
	{
		assert c == '_' : "Supplied char should be a '_', is: " + c;

		final int separator = reader.read();
		if (separator == -1) {
			throwEOFException();
		}
		else if (separator != ':') {
			reportError("Expected ':', found: " + (char)separator, NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
		}

		// First character of the name: not checked, see above.
		final int first = reader.read();
		if (first == -1) {
			throwEOFException();
		}
		name.append((char)first);

		// All following letters and numbers are part of the name.
		int next = reader.read();
		for (; next != -1 && isLetterOrNumber(next); next = reader.read()) {
			name.append((char)next);
		}

		return next;
	}
	
	/**
	 * Checks whether the supplied character is accepted in a blank node name
	 * by this parser: either a letter as defined by {@link #isLetter} of this
	 * class or a number as defined by {@link NTriplesUtil#isNumber(int)}.
	 * 
	 * @see #isLetter
	 * @see NTriplesUtil#isLetterOrNumber(int)
	 */
	public static boolean isLetterOrNumber(int c) {
		if (isLetter(c)) {
			return true;
		}
		return NTriplesUtil.isNumber(c);
	}

	/**
	 * Checks whether the supplied character counts as a letter for this
	 * parser. These are A - Z and a - z, the N-Triples letters, and in addition
	 * '_' and '-' while <code>EXPANDED_LETTERS</code> is <code>true</code>.
	 * 
	 * @see NTriplesUtil#isLetter(int)
	 */
	private static boolean isLetter(int c) {
		if (c >= 'A' && c <= 'Z') {
			return true;
		}
		if (c >= 'a' && c <= 'z') {
			return true;
		}
		return EXPANDED_LETTERS && (c == '_' || c == '-');
	}

	/**
	 * FIXME This is Hacked to allow both "_" and "-" in a bnode name. While
	 * <code>true</code>, {@link #isLetter(int)} treats those two characters as
	 * letters.
	 */
	private static final boolean EXPANDED_LETTERS = true;

	/**
	 * Parses a literal: a quoted label, optionally followed by either
	 * <code>@</code> and a language tag or <code>^^</code> and a datatype
	 * uriref. The label is collected without unescaping; a backslash and the
	 * character following it are both copied into the buffer.
	 * 
	 * @param c
	 *        The current character, expected to be '"'.
	 * @param value
	 *        The buffer that receives the (still escaped) label.
	 * @param lang
	 *        The buffer that receives the language tag, if there is one.
	 * @param datatype
	 *        The buffer that receives the datatype URI, if there is one.
	 * @return The first character after the literal, or -1 at the end of the
	 *         character stream.
	 */
	private int parseLiteral(int c, final StringBuilder value,
			final StringBuilder lang, final StringBuilder datatype)
			throws IOException, RDFParseException
	{
		assert c == '"' : "Supplied char should be a '\"', is: " + c;

		// Collect everything up to the closing, unescaped '"'.
		for (c = reader.read(); c != '"'; c = reader.read()) {
			if (c == -1) {
				throwEOFException();
			}
			value.append((char)c);

			if (c != '\\') {
				continue;
			}

			// The backslash escapes the next character, which might be a
			// double quote, so that character is copied unconditionally.
			final int escaped = reader.read();
			if (escaped == -1) {
				throwEOFException();
			}
			value.append((char)escaped);
		}

		// c == '"', look at the character that follows it
		c = reader.read();

		switch (c) {
		case '@':
			// Language tag: runs up to the end of the stream, '.', '^', a
			// space, a tab or '>' (the end of a Statement about a Statement).
			for (c = reader.read();
					!(c == -1 || c == '.' || c == '^' || c == ' ' || c == '\t' || c == '>');
					c = reader.read()) {
				lang.append((char)c);
			}
			break;

		case '^':
			// Datatype: a second '^' followed by an uriref.
			c = reader.read();
			if (c == -1) {
				throwEOFException();
			}
			else if (c != '^') {
				reportError("Expected '^', found: " + (char)c,
				        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
			}

			c = reader.read();
			if (c == -1) {
				throwEOFException();
			}
			else if (c != '<') {
				reportError("Expected '<', found: " + (char)c,
				        NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
			}

			c = parseUriRef(c, datatype);
			break;

		default:
			// Plain literal: nothing more to read.
			break;
		}

		return c;
	}

	/**
	 * Unescapes the supplied N-Triples uriref and creates a {@link URI} for it.
	 * <p>
	 * If unescaping fails, the error is reported under the
	 * <code>FAIL_ON_NTRIPLES_INVALID_LINES</code> setting, with the original
	 * exception passed along so that its cause is not lost. When that report
	 * does not abort parsing, the URI is created from the string as it was
	 * supplied.
	 * 
	 * @param uri
	 *        The uriref as it appears between '&lt;' and '&gt;'.
	 * @return The URI.
	 * @throws RDFParseException
	 *         If the error is fatal under the parser configuration, or if the
	 *         superclass rejects the URI.
	 */
	@Override
	protected URI createURI(String uri)
		throws RDFParseException
	{
		try {
			uri = NTriplesUtil.unescapeString(uri);
		}
		catch (IllegalArgumentException e) {
			// Pass the exception, not only its message, to keep the cause.
			reportError(e, NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
		}

		return super.createURI(uri);
	}

	/**
	 * Unescapes the supplied label and creates a {@link Literal} for it. An
	 * empty language tag or datatype means "none".
	 * 
	 * @param label
	 *        The label as it appears between the quotes, still escaped.
	 * @param lang
	 *        The language tag, or the empty string if there is none.
	 * @param datatype
	 *        The datatype uriref, or the empty string if there is none.
	 * @return The literal.
	 * @throws RDFParseException
	 *         If the label can not be unescaped (reported as a fatal error
	 *         that carries the original exception as its cause), or if the
	 *         datatype URI or the literal is rejected.
	 */
	protected Literal createLiteral(String label, String lang, String datatype)
		throws RDFParseException
	{
		try {
			label = NTriplesUtil.unescapeString(label);
		}
		catch (IllegalArgumentException e) {
			// Pass the exception, not only its message, to keep the cause.
			reportFatalError(e);
		}

		final String language = lang.length() == 0 ? null : lang;

		final URI dtURI = datatype.length() == 0 ? null : createURI(datatype);

		return super.createLiteral(label, language, dtURI);
	}

	/**
	 * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number
	 * information to the warning. The column is passed as -1.
	 * 
	 * @param msg
	 *        The warning message.
	 */
	@Override
	protected void reportWarning(String msg)
	{
		reportWarning(msg, lineNo, -1);
	}

    /**
     * Overrides {@link RDFParserBase#reportError(String, RioSetting)}, adding
     * line number information to the error. The column is passed as -1.
     * 
     * @param msg
     *        The error message.
     * @param setting
     *        The parser setting that governs how this error is handled.
     */
    @Override
    protected void reportError(String msg, RioSetting<Boolean> setting)
        throws RDFParseException
    {
        reportError(msg, lineNo, -1, setting);
    }

    /**
     * Reports an error that was caused by an exception, adding line number
     * information to the error. The column is passed as -1.
     * 
     * @param e
     *        The exception that describes the error.
     * @param setting
     *        The parser setting that governs how this error is handled.
     */
    protected void reportError(Exception e, RioSetting<Boolean> setting)
        throws RDFParseException
    {
        reportError(e, lineNo, -1, setting);
    }

	/**
	 * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line
	 * number information to the error. The column is passed as -1.
	 * 
	 * @param msg
	 *        The error message.
	 */
	@Override
	protected void reportFatalError(String msg)
		throws RDFParseException
	{
		reportFatalError(msg, lineNo, -1);
	}

	/**
	 * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line
	 * number information to the error. The column is passed as -1.
	 * 
	 * @param e
	 *        The exception that describes the error.
	 */
	@Override
	protected void reportFatalError(Exception e)
		throws RDFParseException
	{
		reportFatalError(e, lineNo, -1);
	}

	/**
	 * Signals that the input ended in the middle of a statement. The current
	 * line number is attached to the exception, as it is for the other errors
	 * raised by this parser; the column is passed as -1.
	 * 
	 * @throws RDFParseException
	 *         Always.
	 */
	private void throwEOFException()
		throws RDFParseException
	{
		throw new RDFParseException("Unexpected end of file", lineNo, -1);
	}
	

	/**
	 * Return a buffer of zero length and non-zero capacity. The same buffer is
	 * reused for each thing which is parsed. This reduces the heap churn
	 * substantially. However, you have to watch out for side-effects and convert
	 * the buffer to a {@link String} before the buffer is reused.
	 * 
	 * @return The shared buffer, with its length reset to zero.
	 */
	private StringBuilder getBuffer() {
		buffer.setLength(0);
		return buffer;
	}

	/** The buffer handed out by {@link #getBuffer()}; initial capacity 100. */
	private final StringBuilder buffer = new StringBuilder(100);

	/**
	 * Return a buffer for the use of parsing literal language tags. The buffer
	 * is of zero length and non-zero capacity. The same buffer is reused for
	 * each tag which is parsed. This reduces the heap churn substantially.
	 * However, you have to watch out for side-effects and convert the buffer to
	 * a {@link String} before the buffer is reused.
	 * 
	 * @return The shared language tag buffer, with its length reset to zero.
	 */
	private StringBuilder getLanguageTagBuffer() {
		languageTagBuffer.setLength(0);
		return languageTagBuffer;
	}

	/** The buffer handed out by {@link #getLanguageTagBuffer()}; initial capacity 8. */
	private final StringBuilder languageTagBuffer = new StringBuilder(8);

	/**
	 * Return a buffer for the use of parsing literal datatype URIs. The buffer
	 * is of zero length and non-zero capacity. The same buffer is reused for
	 * each datatype which is parsed. This reduces the heap churn substantially.
	 * However, you have to watch out for side-effects and convert the buffer to
	 * a {@link String} before the buffer is reused.
	 * 
	 * @return The shared datatype URI buffer, with its length reset to zero.
	 */
	private StringBuilder getDatatypeUriBuffer() {
		datatypeUriBuffer.setLength(0);
		return datatypeUriBuffer;
	}

	/** The buffer handed out by {@link #getDatatypeUriBuffer()}; initial capacity 40. */
	private final StringBuilder datatypeUriBuffer = new StringBuilder(40);

//	@Override
//	protected void clear() {
//		super.clear();
//		// get rid of anything large left in the buffers.
//		buffer.setLength(0);
//		buffer.trimToSize();
//		languageTagBuffer.setLength(0);
//		languageTagBuffer.trimToSize();
//		datatypeUriBuffer.setLength(0);
//		datatypeUriBuffer.trimToSize();
//	}

}