// Source listing: com.bigdata.rdf.rio.ntriples.BigdataNTriplesParser (Maven / Gradle / Ivy)
/*
* Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2007.
*
* Licensed under the Aduna BSD-style license.
*/
package com.bigdata.rdf.rio.ntriples;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.commons.io.input.BOMInputStream;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RioSetting;
import org.openrdf.rio.helpers.NTriplesParserSettings;
import org.openrdf.rio.helpers.RDFParserBase;
import org.openrdf.rio.ntriples.NTriplesUtil;
import com.bigdata.rdf.model.BigdataBNode;
import com.bigdata.rdf.model.BigdataStatement;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
/**
 * RDF parser for N-Triples files. A specification of N-Triples can be found in
 * the <a href="http://www.w3.org/TR/rdf-testcases/#ntriples">N-Triples
 * section</a> of the RDF Test Cases document. This parser is not thread-safe,
 * therefore its public <code>parse</code> methods are synchronized.
 * <p>
 * This parser has been modified to support the inline notation for statements
 * about statements.
 * <p>
 * This parser has been modified to reuse the same {@link StringBuilder} in
 * order to minimize heap churn.
 * <p>
 * This parser has been modified to permit "-" and "_" in blank node IDs (they
 * are not allowed in that position for NTRIPLES). This was done to support a
 * demonstration use case. That change could (and should) be backed out. It is
 * documented by FIXMEs in the code. One of the test files would also have to be
 * fixed.
 *
 * @author Arjohn Kampman
 * @author Bryan Thompson
 * @openrdf
 */
public class BigdataNTriplesParser extends RDFParserBase {
/*-----------*
* Variables *
*-----------*/
// Character source for the current parse. It is assigned in
// parse(Reader, String) and wraps the caller's Reader with one character of
// pushback, which isStatement(int) uses to look ahead.
private PushbackReader reader;
// Number of the line currently being parsed. It starts at 1 and is
// incremented by skipLine(int); the report* methods pass it on.
private int lineNo;
// The value factory most recently passed to setValueFactory(ValueFactory).
private ValueFactory valueFactory;
/**
 * Bounded collection of embedded statements and their associated blank
 * nodes. Once the map holds more than 100 entries the eldest entry is
 * evicted on insert.
 * <p>
 * Note: the map is created with the default (insertion) ordering of
 * {@link LinkedHashMap}, so eviction is by insertion age rather than by
 * least-recent access.
 */
private final Map<Statement, BigdataBNode> sids = new LinkedHashMap<Statement, BigdataBNode>() {

    private static final long serialVersionUID = 1L;

    /**
     * Evicts the eldest entry whenever the map grows beyond 100 entries.
     */
    @Override
    protected boolean removeEldestEntry(
            final Map.Entry<Statement, BigdataBNode> eldest) {
        return size() > 100;
    }
};
/**
 * Holder for the components of the statement that is currently being
 * parsed. One instance exists per nesting level: the parser pushes a fresh
 * instance before it parses an embedded statement and pops it afterwards.
 */
private static class State {

    /** Subject of the statement being assembled. */
    private Resource subject;

    /** Predicate of the statement being assembled. */
    private URI predicate;

    /** Object of the statement being assembled. */
    private Value object;

    /**
     * The SID corresponding to the most recently parsed embedded statement.
     */
    private BigdataBNode lastSID;
}
/**
 * Stack of parse states. The top element is the state of the statement
 * that is currently being parsed; deeper elements belong to the enclosing
 * statements of an embedded statement.
 */
private final Stack<State> stack = new Stack<State>();

/**
 * Makes the given state the current one.
 *
 * @param state
 *        The state to place on top of the stack.
 */
private void push(final State state) {
    stack.push(state);
}

/**
 * Removes and returns the current state.
 *
 * @return The state that was on top of the stack.
 * @throws java.util.EmptyStackException
 *         if the stack is empty.
 */
private State pop() {
    return stack.pop();
}

/**
 * Returns the current state without removing it.
 *
 * @return The state on top of the stack.
 * @throws java.util.EmptyStackException
 *         if the stack is empty.
 */
private State peek() {
    return stack.peek();
}
// /**
// * Return a buffer of zero length and non-zero capacity. The same buffer is
// * reused for each thing which is parsed. This reduces the heap churn
// * substantially. However, you have to watch out for side-effects and
// * convert the buffer to a {@link String} before the buffer is reused.
// *
// * @param capacityIsIgnored
//
// * @return
// */
// private StringBuilder getBuffer() {
// buffer.setLength(0);
// return buffer;
// }
//
// private final StringBuilder buffer = new StringBuilder(100);
//
// private StringBuilder getLanguageTagBuffer() {
// languageTagBuffer.setLength(0);
// return languageTagBuffer;
// }
//
// private final StringBuilder languageTagBuffer = new StringBuilder(8);
//
// private StringBuilder getDatatypeUriBuffer() {
// datatypeUriBuffer.setLength(0);
// return datatypeUriBuffer;
// }
//
// private final StringBuilder datatypeUriBuffer = new StringBuilder(40);
/*--------------*
* Constructors *
*--------------*/
/**
 * Creates a new NTriplesParser that uses the value factory returned by
 * <code>BigdataValueFactoryImpl.getInstance("")</code> (rather than the
 * default Sesame {@link ValueFactoryImpl}) to create objects for resources,
 * bNodes and literals.
 */
public BigdataNTriplesParser() {
// We are providing Bigdata-specific value factory to support parsing of RDR,
// which require BigdataValueFactory instead of default Sesame implementation
// See https://jira.blazegraph.com/browse/BLZG-1322
super(BigdataValueFactoryImpl.getInstance(""));
}
/**
 * Creates a new NTriplesParser that will use the supplied
 * {@link BigdataValueFactory} to create RDF model objects.
 *
 * @param valueFactory
 *        The value factory, handed to the superclass constructor.
 */
public BigdataNTriplesParser(BigdataValueFactory valueFactory) {
super(valueFactory);
}
/**
 * Sets the value factory on the base parser and also records it locally so
 * that {@link #getValueFactory()} can return it.
 *
 * @param valueFactory
 *        The value factory to use. {@link #getValueFactory()} and the
 *        parsing of embedded statements cast it to
 *        {@link BigdataValueFactory}.
 */
@Override
public void setValueFactory(final ValueFactory valueFactory) {
    super.setValueFactory(valueFactory);
    this.valueFactory = valueFactory;
}
/**
 * Return the {@link BigdataValueFactory}, i.e. the value factory that was
 * last passed to {@link #setValueFactory(ValueFactory)}, cast to
 * {@link BigdataValueFactory}.
 *
 * @return The value factory; <code>null</code> if none has been recorded.
 * @throws ClassCastException
 *         if you have not set a {@link BigdataValueFactory}.
 */
protected BigdataValueFactory getValueFactory() {
return (BigdataValueFactory) valueFactory;
}
/*---------*
* Methods *
*---------*/
/**
 * Implements <code>RDFParser.getRDFFormat()</code>.
 *
 * @return Always {@link RDFFormat#NTRIPLES}.
 */
@Override
public final RDFFormat getRDFFormat() {
    return RDFFormat.NTRIPLES;
}
/**
 * Implementation of the <code>parse(InputStream, String)</code> method
 * defined in the RDFParser interface. The stream is wrapped in a
 * {@link BOMInputStream}, decoded as US-ASCII and handed to
 * {@link #parse(Reader, String)}.
 *
 * @param in
 *        The InputStream from which to read the data, must not be
 *        <code>null</code>. The InputStream is supposed to contain 7-bit
 *        US-ASCII characters, as per the N-Triples specification.
 * @param baseURI
 *        The URI associated with the data in the InputStream, must not be
 *        <code>null</code>.
 * @throws IOException
 *         If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException
 *         If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException
 *         If the configured statement handler encountered an unrecoverable
 *         error.
 * @throws IllegalArgumentException
 *         If the supplied input stream or base URI is <code>null</code>.
 */
@Override
public synchronized void parse(InputStream in, String baseURI)
    throws IOException, RDFParseException, RDFHandlerException
{
    if (in == null) {
        throw new IllegalArgumentException("Input stream can not be 'null'");
    }

    // The base URI is validated by parse(Reader, String).
    try {
        final InputStream bomAware = new BOMInputStream(in, false);
        final Reader asciiReader = new InputStreamReader(bomAware, "US-ASCII");
        parse(asciiReader, baseURI);
    }
    catch (UnsupportedEncodingException e) {
        // Every platform should support the US-ASCII encoding...
        throw new RuntimeException(e);
    }
}
/**
 * Implementation of the <code>parse(Reader, String)</code> method defined in
 * the RDFParser interface. Reads the input line by line, skipping comment
 * lines (starting with '#') and empty lines, and parses every other line as
 * a triple.
 *
 * @param reader
 *        The Reader from which to read the data, must not be
 *        <code>null</code>.
 * @param baseURI
 *        The URI associated with the data in the Reader, must not be
 *        <code>null</code>.
 * @throws IOException
 *         If an I/O error occurred while data was read from the Reader.
 * @throws RDFParseException
 *         If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException
 *         If the configured statement handler encountered an unrecoverable
 *         error.
 * @throws IllegalArgumentException
 *         If the supplied reader or base URI is <code>null</code>.
 */
@Override
public synchronized void parse(final Reader reader, final String baseURI)
    throws IOException, RDFParseException, RDFHandlerException
{
    if (reader == null) {
        throw new IllegalArgumentException("Reader can not be 'null'");
    }
    if (baseURI == null) {
        throw new IllegalArgumentException("base URI can not be 'null'");
    }

    rdfHandler.startRDF();

    // We need pushback for '<<' versus '<'.
    this.reader = new PushbackReader(reader, 1/* size */);
    lineNo = 1;

    reportLocation(lineNo, 1);

    // Start from an empty stack so that the state pushed here is the only
    // (top-level) state, whatever an earlier parse left behind.
    stack.clear();
    push(new State());

    try {
        // Read through the pushback reader (the field), not the raw
        // parameter, so that all reads go through the same object.
        int c = this.reader.read();
        c = skipWhitespace(c);

        while (c != -1) {
            if (c == '#') {
                // Comment, ignore
                c = skipLine(c);
            }
            else if (c == '\r' || c == '\n') {
                // Empty line, ignore
                c = skipLine(c);
            }
            else {
                c = parseTriple(c, false/* embedded */);
            }

            c = skipWhitespace(c);
        }
    }
    finally {
        // Drop the top-level state and any states left over from a failure
        // inside an embedded statement; otherwise the stack grows with
        // every call.
        stack.clear();
        // Inherited clean-up.
        clear();
    }

    rdfHandler.endRDF();
}
/**
 * Skips spaces and tabs. Starting with the supplied character, characters
 * are read from the reader until one is found that is neither a space nor a
 * tab; that character is returned.
 *
 * @param c
 *        The current character.
 * @return The first character that is not a space or tab, or -1 if the end
 *         of the character stream has been reached.
 */
protected int skipWhitespace(int c)
    throws IOException
{
    for (;;) {
        if (c != ' ' && c != '\t') {
            return c;
        }
        c = reader.read();
    }
}
/**
 * Verifies that there is only whitespace until the end of the line. A fatal
 * error is reported if anything other than an end of line or the end of the
 * stream follows the whitespace.
 *
 * @param c
 *        The current character. Its value is not inspected; the check
 *        starts with the next character read from the reader.
 * @return The first character after the whitespace ('\r', '\n' or -1 when
 *         the check succeeds).
 */
protected int assertLineTerminates(int c)
    throws IOException, RDFParseException
{
    c = skipWhitespace(reader.read());

    final boolean atEndOfLine = (c == -1 || c == '\r' || c == '\n');
    if (!atEndOfLine) {
        reportFatalError("Content after '.' is not allowed");
    }

    return c;
}
/**
 * Discards the remainder of the current line. Characters are read until the
 * first EOL ("\n", "\r" or "\r\n") has been consumed; the line counter is
 * then advanced and the new location is reported.
 *
 * @param c
 *        The current character.
 * @return The first character after the EOL, or -1 if the end of the
 *         character stream has been reached.
 */
protected int skipLine(int c)
    throws IOException
{
    // Advance to the line terminator (or the end of the stream).
    while (c != -1 && c != '\r' && c != '\n') {
        c = reader.read();
    }

    if (c == -1) {
        // End of stream: there is no EOL to consume.
        return c;
    }

    final boolean carriageReturn = (c == '\r');
    c = reader.read();
    if (carriageReturn && c == '\n') {
        // "\r\n" counts as a single EOL.
        c = reader.read();
    }

    lineNo++;
    reportLocation(lineNo, 1);

    return c;
}
/**
 * Parses one statement (subject, predicate, object and terminator) into the
 * current {@link State} and passes the resulting statement to the RDF
 * handler.
 * <p>
 * A top-level statement must be terminated by '.'; the rest of its line is
 * then skipped. An embedded statement must be terminated by "&gt;&gt;"; for
 * it, the blank node created from the statement is additionally stored in
 * {@link State#lastSID} so that the caller can use it as the subject or
 * object of the enclosing statement.
 *
 * @param c
 *        The first character of the statement.
 * @param embedded
 *        <code>true</code> iff the statement is enclosed in "&lt;&lt;" and
 *        "&gt;&gt;".
 * @return The first unconsumed character: for a top-level statement the
 *         first character of the next line, for an embedded statement the
 *         first non-whitespace character after "&gt;&gt;" (-1 at the end of
 *         the stream).
 */
private int parseTriple(int c, final boolean embedded)
    throws IOException, RDFParseException, RDFHandlerException
{
    c = parseSubject(c);

    c = skipWhitespace(c);

    c = parsePredicate(c);

    c = skipWhitespace(c);

    c = parseObject(c);

    c = skipWhitespace(c);

    if (c == -1) {
        throwEOFException();
    }
    else if (embedded) {
        // Embedded: expect the closing '>>'.
        if (c != '>') {
            reportFatalError("Expected '>', found: " + (char) c);
        }
        c = reader.read();
        if (c == -1) {
            // Report a truncated '>>' as EOF rather than as an unexpected
            // (char) -1.
            throwEOFException();
        }
        if (c != '>') {
            reportFatalError("Expected '>', found: " + (char) c);
        }
        // eat the >> and then skip to the next non-whitespace character on
        // the same line.
        c = skipWhitespace(reader.read());
    }
    else {
        // Non-embedded: expect the terminating '.'.
        if (c != '.') {
            reportFatalError("Expected '.', found: " + (char) c);
        }
        c = skipLine(c);
    }

    final State state = peek();

    if (embedded) {
        // Create statement.
        final BigdataStatement st = (BigdataStatement) createStatement(
                state.subject, state.predicate, state.object);
        // add the RDR statement inside the << >>.
        rdfHandler.handleStatement(st);
        // The blank node standing for the embedded statement.
        state.lastSID = ((BigdataValueFactory) valueFactory).createBNode(st);
    }
    else {
        // simple statement (original code path).
        final Statement st = createStatement(
                state.subject, state.predicate, state.object);
        rdfHandler.handleStatement(st);
    }

    return c;
}
/**
 * Return <code>true</code> if the next character is <code>&lt;</code>. This
 * should only be invoked when the current character is known to be
 * <code>&lt;</code>. It provides one character lookahead to differentiate
 * between a URI and a Statement: the character is read and then pushed back
 * onto the reader. For example, an embedded Statement in the subject
 * position of another statement looks like this (the IRIs are
 * illustrative):
 *
 * <pre>
 * &lt;&lt;&lt;http://example.org/s&gt; &lt;http://example.org/p&gt; "no"@en&gt;&gt; &lt;http://example.org/q&gt; "288" .
 * </pre>
 *
 * @param c
 *        The current character, expected to be '&lt;'.
 * @return <code>true</code> iff the character following <code>c</code> is
 *         also '&lt;'.
 */
private boolean isStatement(int c) throws RDFParseException, IOException {
    assert c == '<' : "Supplied char should be a '<', is: " + c;

    final int lookahead = reader.read();
    if (lookahead == -1) {
        throwEOFException();
    }
    // Put the character back; the caller reads it again.
    reader.unread(lookahead);

    return lookahead == '<';
}
/**
 * Parses the subject of a statement and stores it in the current
 * {@link State}. The subject is either an uriref
 * (<code>&lt;foo://bar&gt;</code>), a nodeID (<code>_:node1</code>) or an
 * embedded Statement (<code>&lt;&lt; ... &gt;&gt;</code>).
 *
 * @param c
 *        The first character of the subject.
 * @return The first character after the subject.
 */
private int parseSubject(int c)
    throws IOException, RDFParseException, RDFHandlerException
{
    final State current = peek();

    switch (c) {
    case '<':
        if (isStatement(c)) {
            // Embedded statement: consume the second '<' of '<<'.
            c = reader.read(); // known '<'
            if (c != '<')
                reportFatalError("Expected '<', found: " + (char) c);
            // Skip any whitespace between '<<' and the embedded subject.
            c = skipWhitespace(reader.read());
            // The embedded statement gets its own state; its SID becomes
            // the subject of the enclosing statement.
            push(new State());
            c = parseTriple(c, true/* embedded */);
            current.subject = pop().lastSID;
        }
        else {
            // subject is an uriref
            final StringBuilder uri = getBuffer();
            c = parseUriRef(c, uri);
            current.subject = createURI(uri.toString());
        }
        break;
    case '_': {
        // subject is a bNode
        final StringBuilder nodeID = getBuffer();
        c = parseNodeID(c, nodeID);
        current.subject = createBNode(nodeID.toString());
        break;
    }
    case -1:
        throwEOFException();
        break;
    default:
        reportFatalError("Expected '<' or '_', found: " + (char)c);
        break;
    }

    return c;
}
/**
 * Parses the predicate of a statement and stores it in the current
 * {@link State}. The predicate must be an uriref
 * (<code>&lt;foo://bar&gt;</code>).
 *
 * @param c
 *        The first character of the predicate.
 * @return The first character after the predicate.
 */
private int parsePredicate(int c)
    throws IOException, RDFParseException
{
    if (c == -1) {
        throwEOFException();
    }
    else if (c != '<') {
        reportFatalError("Expected '<', found: " + (char)c);
    }
    else {
        // predicate is an uriref
        final StringBuilder uri = getBuffer();
        c = parseUriRef(c, uri);
        peek().predicate = createURI(uri.toString());
    }

    return c;
}
/**
 * Parses the object of a statement and stores it in the current
 * {@link State}. The object is either an uriref
 * (<code>&lt;foo://bar&gt;</code>), a nodeID (<code>_:node1</code>), a
 * literal (<code>"foo"@en</code> or
 * <code>"1"^^&lt;xsd:integer&gt;</code>) or an embedded Statement
 * (<code>&lt;&lt; ... &gt;&gt;</code>).
 *
 * @param c
 *        The first character of the object.
 * @return The first character after the object.
 */
private int parseObject(int c)
    throws IOException, RDFParseException, RDFHandlerException
{
    final State current = peek();

    switch (c) {
    case '<':
        if (isStatement(c)) {
            // Embedded statement: consume the second '<' of '<<'.
            c = reader.read(); // known '<'
            if (c != '<')
                reportFatalError("Expected '<', found: " + (char) c);
            // Skip any whitespace between '<<' and the embedded subject.
            c = skipWhitespace(reader.read());
            // The embedded statement gets its own state; its SID becomes
            // the object of the enclosing statement.
            push(new State());
            c = parseTriple(c, true/* embedded */);
            current.object = pop().lastSID;
        }
        else {
            // object is an uriref
            final StringBuilder uri = getBuffer();
            c = parseUriRef(c, uri);
            current.object = createURI(uri.toString());
        }
        break;
    case '_': {
        // object is a bNode
        final StringBuilder nodeID = getBuffer();
        c = parseNodeID(c, nodeID);
        current.object = createBNode(nodeID.toString());
        break;
    }
    case '"': {
        // object is a literal; label, language tag and datatype each get
        // their own reusable buffer.
        final StringBuilder label = getBuffer();
        final StringBuilder languageTag = getLanguageTagBuffer();
        final StringBuilder datatypeUri = getDatatypeUriBuffer();
        c = parseLiteral(c, label, languageTag, datatypeUri);
        current.object = createLiteral(label.toString(), languageTag.toString(), datatypeUri.toString());
        break;
    }
    case -1:
        throwEOFException();
        break;
    default:
        reportFatalError("Expected '<', '_' or '\"', found: " + (char)c);
        break;
    }

    return c;
}
/**
 * Reads an uriref of the form <code>&lt;...&gt;</code>. Every character
 * between the angle brackets is appended, unmodified, to the supplied
 * buffer.
 *
 * @param c
 *        The current character, expected to be the opening '&lt;'.
 * @param uriRef
 *        The buffer that receives the characters of the uriref.
 * @return The first character after the closing '&gt;'.
 */
private int parseUriRef(int c, StringBuilder uriRef)
    throws IOException, RDFParseException
{
    assert c == '<' : "Supplied char should be a '<', is: " + c;

    // Collect everything up to the next '>' character.
    for (c = reader.read(); c != '>'; c = reader.read()) {
        if (c == -1) {
            throwEOFException();
        }
        uriRef.append((char)c);
    }

    // c == '>', hand back the character that follows it.
    return reader.read();
}
/**
 * Reads a nodeID of the form <code>_:name</code> and appends the name to the
 * supplied buffer.
 * <p>
 * The first character of the name is accepted as is (this was modified to
 * allow fully numeric bnode ids); the name then continues for as long as
 * {@link #isLetterOrNumber(int)} accepts the characters.
 *
 * @param c
 *        The current character, expected to be '_'.
 * @param name
 *        The buffer that receives the name of the node.
 * @return The first character after the name (-1 at the end of the stream).
 */
private int parseNodeID(int c, StringBuilder name)
    throws IOException, RDFParseException
{
    assert c == '_' : "Supplied char should be a '_', is: " + c;

    // The '_' must be followed by ':'.
    c = reader.read();
    if (c == -1) {
        throwEOFException();
    }
    if (c != ':') {
        reportError("Expected ':', found: " + (char)c, NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
    }

    // First character of the name: not validated.
    c = reader.read();
    if (c == -1) {
        throwEOFException();
    }
    name.append((char)c);

    // All following letters and numbers are part of the name.
    for (c = reader.read(); c != -1 && isLetterOrNumber(c); c = reader.read()) {
        name.append((char)c);
    }

    return c;
}
/**
 * Checks whether the supplied character is a letter or number. This differs
 * from {@link NTriplesUtil#isLetterOrNumber(int)} in that the letter test is
 * the local <code>isLetter(int)</code>, which accepts '_' and '-' in addition
 * to A - Z and a - z while <code>EXPANDED_LETTERS</code> is
 * <code>true</code>.
 *
 * @param c
 *        The character to test.
 * @return <code>true</code> iff the local <code>isLetter(int)</code> or
 *         <code>NTriplesUtil.isNumber(int)</code> accepts the character.
 * @see NTriplesUtil#isLetterOrNumber(int)
 */
public static boolean isLetterOrNumber(int c) {
return isLetter(c) || NTriplesUtil.isNumber(c);
}
/**
 * Checks whether the supplied character counts as a letter for this parser.
 * The N-Triples letters are A - Z and a - z; while {@link #EXPANDED_LETTERS}
 * is <code>true</code>, '_' and '-' are accepted as well.
 *
 * @param c
 *        The character to test.
 * @return <code>true</code> iff the character is accepted as a letter.
 * @see NTriplesUtil#isLetter(int)
 */
private static boolean isLetter(int c) {
    if (c >= 'A' && c <= 'Z') {
        return true;
    }
    if (c >= 'a' && c <= 'z') {
        return true;
    }
    return EXPANDED_LETTERS && (c == '_' || c == '-');
}

/** FIXME This is Hacked to allow both "_" and "-" in a bnode name. */
private static final boolean EXPANDED_LETTERS = true;
/**
 * Reads a literal: a quoted label that is optionally followed by either a
 * language tag (<code>@en</code>) or a datatype
 * (<code>^^&lt;uri&gt;</code>).
 * <p>
 * The label is copied verbatim, i.e. escape sequences are not decoded here;
 * a backslash merely prevents the character that follows it from ending the
 * label.
 *
 * @param c
 *        The current character, expected to be the opening '"'.
 * @param value
 *        The buffer that receives the (still escaped) label.
 * @param lang
 *        The buffer that receives the language tag, if any.
 * @param datatype
 *        The buffer that receives the datatype URI, if any.
 * @return The first character after the literal.
 */
private int parseLiteral(int c, final StringBuilder value,
        final StringBuilder lang, final StringBuilder datatype)
    throws IOException, RDFParseException
{
    assert c == '"' : "Supplied char should be a '\"', is: " + c;

    // Label: everything up to the next unescaped '"' character.
    for (c = reader.read(); c != '"'; c = reader.read()) {
        if (c == -1) {
            throwEOFException();
        }
        value.append((char)c);

        if (c == '\\') {
            // The backslash escapes the next character, which might be a
            // double quote; copy it without inspecting it.
            c = reader.read();
            if (c == -1) {
                throwEOFException();
            }
            value.append((char)c);
        }
    }

    // c == '"', move on to the character after the closing quote.
    c = reader.read();

    switch (c) {
    case '@':
        // Language tag: runs up to EOF, '.', '^', a space, a tab or the
        // '>' that ends a statement about a statement.
        c = reader.read();
        while (c != -1 && c != '.' && c != '^' && c != ' ' && c != '\t'
                && c != '>') {
            lang.append((char)c);
            c = reader.read();
        }
        break;
    case '^':
        // Datatype: a second '^' ...
        c = reader.read();
        if (c == -1) {
            throwEOFException();
        }
        else if (c != '^') {
            reportError("Expected '^', found: " + (char)c,
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }

        // ... followed by an uriref.
        c = reader.read();
        if (c == -1) {
            throwEOFException();
        }
        else if (c != '<') {
            reportError("Expected '<', found: " + (char)c,
                    NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
        }

        c = parseUriRef(c, datatype);
        break;
    default:
        // Plain literal: nothing more to read.
        break;
    }

    return c;
}
/**
 * Creates a URI after decoding the N-Triples escape sequences in the
 * supplied string. If the string cannot be unescaped, an error is reported
 * against {@link NTriplesParserSettings#FAIL_ON_NTRIPLES_INVALID_LINES};
 * when that report does not throw, the URI is created from the original
 * (still escaped) string.
 *
 * @param uri
 *        The URI string as it was read from the input.
 * @return The URI created by the superclass.
 */
@Override
protected URI createURI(String uri)
    throws RDFParseException
{
    String unescaped = uri;
    try {
        unescaped = NTriplesUtil.unescapeString(uri);
    }
    catch (IllegalArgumentException e) {
        reportError(e.getMessage(), NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
    }

    return super.createURI(unescaped);
}
/**
 * Creates a literal from the raw strings read by the parser. The escape
 * sequences in the label are decoded (a fatal error is reported if that
 * fails); an empty language tag or datatype counts as absent.
 *
 * @param label
 *        The label as it was read from the input.
 * @param lang
 *        The language tag; empty if the literal has none.
 * @param datatype
 *        The datatype URI string; empty if the literal has none.
 * @return The literal created by the superclass.
 */
protected Literal createLiteral(String label, String lang, String datatype)
    throws RDFParseException
{
    String unescapedLabel = label;
    try {
        unescapedLabel = NTriplesUtil.unescapeString(label);
    }
    catch (IllegalArgumentException e) {
        reportFatalError(e.getMessage());
    }

    // Empty strings stand for "no language tag" / "no datatype".
    final String languageTag = (lang.length() == 0) ? null : lang;
    final URI datatypeURI = (datatype.length() == 0) ? null : createURI(datatype);

    return super.createLiteral(unescapedLabel, languageTag, datatypeURI);
}
/**
 * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number
 * information to the warning. The current line number is passed along; the
 * column is passed as -1.
 *
 * @param msg
 *        The warning message.
 */
@Override
protected void reportWarning(String msg)
{
reportWarning(msg, lineNo, -1);
}
/**
 * Overrides the <code>reportError(String, RioSetting)</code> method of
 * {@link RDFParserBase}, adding line number information to the error. The
 * current line number is passed along; the column is passed as -1.
 *
 * @param msg
 *        The error message.
 * @param setting
 *        The parser setting that the error is reported against.
 * @throws RDFParseException
 *         As thrown by the four-argument <code>reportError</code> this
 *         method delegates to.
 */
@Override
protected void reportError(String msg, RioSetting<Boolean> setting)
    throws RDFParseException
{
    reportError(msg, lineNo, -1, setting);
}
/**
 * Reports an error that was caused by an exception, adding line number
 * information to the error. The current line number is passed along; the
 * column is passed as -1.
 *
 * @param e
 *        The exception that describes the error.
 * @param setting
 *        The parser setting that the error is reported against.
 * @throws RDFParseException
 *         As thrown by the four-argument <code>reportError</code> this
 *         method delegates to.
 */
protected void reportError(Exception e, RioSetting<Boolean> setting)
    throws RDFParseException
{
    reportError(e, lineNo, -1, setting);
}
/**
 * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line
 * number information to the error. The current line number is passed along;
 * the column is passed as -1.
 *
 * @param msg
 *        The error message.
 */
@Override
protected void reportFatalError(String msg)
throws RDFParseException
{
reportFatalError(msg, lineNo, -1);
}
/**
 * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line
 * number information to the error. The current line number is passed along;
 * the column is passed as -1.
 *
 * @param e
 *        The exception that describes the error.
 */
@Override
protected void reportFatalError(Exception e)
throws RDFParseException
{
reportFatalError(e, lineNo, -1);
}
/**
 * Signals that the input ended in the middle of a statement. This method
 * never returns normally.
 *
 * @throws RDFParseException
 *         Always; the exception carries the current line number (the
 *         column is passed as -1), like the other errors reported by this
 *         parser.
 */
private void throwEOFException()
    throws RDFParseException
{
    throw new RDFParseException("Unexpected end of file", lineNo, -1);
}
/**
 * Return a buffer of zero length and non-zero capacity. The same buffer is
 * reused for each thing which is parsed. This reduces the heap churn
 * substantially. However, you have to watch out for side-effects and convert
 * the buffer to a {@link String} before the buffer is reused.
 *
 * @return The shared buffer, reset to length zero.
 */
private StringBuilder getBuffer() {
buffer.setLength(0);
return buffer;
}
// Shared buffer handed out by getBuffer(); created with a capacity of 100.
private final StringBuilder buffer = new StringBuilder(100);
/**
 * Return a buffer for the use of parsing literal language tags. The buffer
 * is of zero length and non-zero capacity. The same buffer is reused for
 * each tag which is parsed. This reduces the heap churn substantially.
 * However, you have to watch out for side-effects and convert the buffer to
 * a {@link String} before the buffer is reused.
 *
 * @return The shared language tag buffer, reset to length zero.
 */
private StringBuilder getLanguageTagBuffer() {
languageTagBuffer.setLength(0);
return languageTagBuffer;
}
// Shared buffer handed out by getLanguageTagBuffer(); created with a
// capacity of 8.
private final StringBuilder languageTagBuffer = new StringBuilder(8);
/**
 * Return a buffer for the use of parsing literal datatype URIs. The buffer
 * is of zero length and non-zero capacity. The same buffer is reused for
 * each datatype which is parsed. This reduces the heap churn substantially.
 * However, you have to watch out for side-effects and convert the buffer to
 * a {@link String} before the buffer is reused.
 *
 * @return The shared datatype URI buffer, reset to length zero.
 */
private StringBuilder getDatatypeUriBuffer() {
datatypeUriBuffer.setLength(0);
return datatypeUriBuffer;
}
// Shared buffer handed out by getDatatypeUriBuffer(); created with a
// capacity of 40.
private final StringBuilder datatypeUriBuffer = new StringBuilder(40);
// @Override
// protected void clear() {
// super.clear();
// // get rid of anything large left in the buffers.
// buffer.setLength(0);
// buffer.trimToSize();
// languageTagBuffer.setLength(0);
// languageTagBuffer.trimToSize();
// datatypeUriBuffer.setLength(0);
// datatypeUriBuffer.trimToSize();
// }
}