All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eclipse.rdf4j.rio.ntriples.NTriplesParser Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Distribution License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *******************************************************************************/
package org.eclipse.rdf4j.rio.ntriples;

import static org.eclipse.rdf4j.rio.helpers.NTriplesUtil.unescapeString;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;

import org.apache.commons.io.input.BOMInputStream;
import org.eclipse.rdf4j.common.text.ASCIIUtil;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
import org.eclipse.rdf4j.rio.helpers.NTriplesParserSettings;
import org.eclipse.rdf4j.rio.helpers.NTriplesUtil;

/**
 * RDF parser for N-Triples files. A specification of NTriples can be found in
 * this section of the RDF Test Cases document. This parser
 * is not thread-safe, therefore its public methods are synchronized.
 *
 * @author Arjohn Kampman
 */
public class NTriplesParser extends AbstractRDFParser {

	protected BufferedReader reader;
	protected char[] lineChars;
	protected int currentIndex;
	protected long lineNo;
	protected Resource subject;
	protected IRI predicate;
	protected Value object;

	/**
	 * Creates a new NTriplesParser that will use a {@link SimpleValueFactory} to create object for resources, bNodes
	 * and literals.
	 */
	public NTriplesParser() {
		super();
	}

	/**
	 * Creates a new NTriplesParser that will use the supplied ValueFactory to create RDF model objects.
	 *
	 * @param valueFactory A ValueFactory.
	 */
	public NTriplesParser(ValueFactory valueFactory) {
		super(valueFactory);
	}

	@Override
	public RDFFormat getRDFFormat() {
		return RDFFormat.NTRIPLES;
	}

	@Override
	public synchronized void parse(InputStream in, String baseURI)
			throws IOException, RDFParseException, RDFHandlerException {
		if (in == null) {
			throw new IllegalArgumentException("Input stream can not be 'null'");
		}

		try {
			parse(new BufferedReader(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8)),
					baseURI);
		} catch (UnsupportedEncodingException e) {
			// Every platform should support the UTF-8 encoding...
			throw new RuntimeException(e);
		}
	}

	@Override
	public synchronized void parse(Reader reader, String baseURI)
			throws IOException, RDFParseException, RDFHandlerException {
		clear();

		try {
			if (reader == null) {
				throw new IllegalArgumentException("Reader can not be 'null'");
			}

			if (rdfHandler != null) {
				rdfHandler.startRDF();
			}

			if (reader instanceof BufferedReader) {
				this.reader = (BufferedReader) reader;
			} else {
				this.reader = new BufferedReader(reader);
			}
			lineNo = 0;

			reportLocation(lineNo, 1);

			while (readLine()) {
				parseStatement();
			}
		} finally {
			clear();
		}

		if (rdfHandler != null) {
			rdfHandler.endRDF();
		}
	}

	protected void parseStatement() throws RDFParseException, RDFHandlerException {
		boolean ignoredAnError = false;
		try {
			skipWhitespace(false);
			if (!shouldParseLine()) {
				return;
			}
			parseSubject();

			skipWhitespace(true);

			parsePredicate();

			skipWhitespace(true);

			parseObject();

			skipWhitespace(true);

			assertLineTerminates();
		} catch (RDFParseException e) {
			if (!getParserConfig().get(NTriplesParserSettings.FAIL_ON_INVALID_LINES)
					|| getParserConfig().isNonFatalError(NTriplesParserSettings.FAIL_ON_INVALID_LINES)) {
				reportError(e, NTriplesParserSettings.FAIL_ON_INVALID_LINES);
				ignoredAnError = true;
			} else {
				throw e;
			}
		}
		handleStatement(ignoredAnError);
	}

	protected void skipWhitespace(boolean throwEOF) {
		while (currentIndex < lineChars.length && (lineChars[currentIndex] == ' ' || lineChars[currentIndex] == '\t')) {
			currentIndex++;
		}
		if (currentIndex >= lineChars.length && throwEOF) {
			throwEOFException();
		}
	}

	protected boolean shouldParseLine() {
		if (currentIndex < lineChars.length - 1) {
			if (lineChars[currentIndex] != '#') {
				return true;
			} else {
				if (rdfHandler != null) {
					rdfHandler.handleComment(
							new String(lineChars, currentIndex + 1, lineChars.length - currentIndex - 1));
				}
			}
		}
		return false;
	}

	protected void parseSubject() {
		if (lineChars[currentIndex] == '<') {
			subject = parseIRI();
		} else if (lineChars[currentIndex] == '_') {
			subject = parseNode();
		} else {
			throw new RDFParseException(
					"Expected '<' or '_', found: " + new String(Character.toChars(lineChars[currentIndex])), lineNo,
					lineChars[currentIndex]);
		}
	}

	protected void parsePredicate() {
		if (lineChars[currentIndex] == '<') {
			predicate = parseIRI();
		} else {
			throw new RDFParseException(
					"Expected '<', found: " + new String(Character.toChars(lineChars[currentIndex])), lineNo,
					lineChars[currentIndex]);
		}
	}

	protected void parseObject() {
		if (lineChars[currentIndex] == '<') {
			object = parseIRI();
		} else if (lineChars[currentIndex] == '_') {
			object = parseNode();
		} else if (lineChars[currentIndex] == '"') {
			parseLiteral();
		} else {
			throw new RDFParseException(
					"Expected '<' or '_', found: " + new String(Character.toChars(lineChars[currentIndex])), lineNo,
					lineChars[currentIndex]);
		}
	}

	/**
	 * Verifies that there is only whitespace or comments until the end of the line.
	 */
	protected void assertLineTerminates() throws RDFParseException {
		if (!NTriplesUtil.isDot(lineChars[currentIndex])) {
			if (lineChars[currentIndex] != '#') {
				reportFatalError("Content after '.' is not allowed");
			} else {
				return;
			}
		}
		if (lineChars.length - 1 > currentIndex) {
			currentIndex++;
			skipWhitespace(false);
			if (currentIndex >= lineChars.length) {
				return;
			}
			if (lineChars[currentIndex] != ' ' && lineChars[currentIndex] != '\t' && lineChars[currentIndex] != '#') {
				throw new RDFParseException("line must end with '.'", lineNo, currentIndex);
			}
		}
	}

	protected void handleStatement(boolean ignoredAnError) {
		if (rdfHandler != null && !ignoredAnError) {
			rdfHandler.handleStatement(valueFactory.createStatement(subject, predicate, object));
		}
		subject = null;
		predicate = null;
		object = null;
	}

	protected IRI parseIRI() {
		if (lineChars[currentIndex] != '<') {
			reportError("Supplied char should be a '<', is: " + new String(Character.toChars(lineChars[currentIndex])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		int startIndex = currentIndex + 1;
		moveToIRIEndIndex();
		IRI iri = createURI(new String(lineChars, startIndex, currentIndex - startIndex));
		currentIndex++;
		return iri;
	}

	protected Resource parseNode() {
		if (lineChars[currentIndex] != '_') {
			reportError("Supplied char should be a '_', is: " + new String(Character.toChars(lineChars[currentIndex])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		int startIndex = currentIndex + 2;
		moveToBNodeEndIndex();
		return createNode(new String(lineChars, startIndex, currentIndex - startIndex));
	}

	private void parseLiteral() {
		String label = parseLabel();
		incrementIndexOrThrowEOF();
		if (currentIndex < lineChars.length - 1 && lineChars[currentIndex] == '^') {
			parseLiteralWithDatatype(label);
		} else if (lineChars[currentIndex] == '@') {
			parseLangLiteral(label);
		} else {
			object = createLiteral(label, null, null, lineNo, lineChars[currentIndex]);
		}
	}

	private String parseLabel() {
		int startIndex = currentIndex;
		incrementIndexOrThrowEOF();
		while (lineChars[currentIndex] != '\"') {
			if (lineChars[currentIndex] == '\\') {
				currentIndex++;
			}
			incrementIndexOrThrowEOF();
		}
		try {
			return unescapeString(new String(lineChars, startIndex + 1, currentIndex - startIndex - 1));
		} catch (IllegalArgumentException e) {
			throw new RDFParseException("Illegal unicode escape sequence", lineNo, -1);
		}
	}

	private void parseLiteralWithDatatype(String label) {
		if (lineChars[currentIndex + 1] != '^') {
			reportError("Expected '^', found: " + new String(Character.toChars(lineChars[currentIndex + 1])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		currentIndex += 2;
		if (currentIndex >= lineChars.length || lineChars[currentIndex] != '<') {
			reportError("Expected '<', found: " + new String(Character.toChars(lineChars[currentIndex])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		object = createLiteral(label, null, parseIRI(), lineNo, lineChars[currentIndex]);
	}

	private void parseLangLiteral(String label) {
		incrementIndexOrThrowEOF();
		if (!ASCIIUtil.isLetter(lineChars[currentIndex])) {
			reportError("Expected a letter, found: " + new String(Character.toChars(lineChars[currentIndex])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		int startIndex = currentIndex;
		while (currentIndex < lineChars.length && (!NTriplesUtil.isDot(lineChars[currentIndex])
				&& lineChars[currentIndex] != '^'
				&& lineChars[currentIndex] != ' '
				&& lineChars[currentIndex] != '\t')) {
			currentIndex++;
		}
		if (currentIndex >= lineChars.length) {
			throwEOFException();
		}
		object = createLiteral(label, new String(lineChars, startIndex, currentIndex - startIndex), null,
				lineNo, lineChars[currentIndex]);
	}

	/**
	 * Moves the current line index position to the end of the IRI.
	 */
	private void moveToIRIEndIndex() throws RDFParseException {
		currentIndex++;
		while (currentIndex < lineChars.length && lineChars[currentIndex] != '>') {
			if (lineChars[currentIndex] == ' ') {
				reportError(
						"IRI included an unencoded space: " + new String(Character.toChars(lineChars[currentIndex])),
						BasicParserSettings.VERIFY_URI_SYNTAX);
			}
			if (lineChars[currentIndex] == '\\') {
				// This escapes the next character, which might be a '>'
				incrementIndexOrThrowEOF();
				if (lineChars[currentIndex] != 'u' && lineChars[currentIndex] != 'U') {
					reportError("IRI includes string escapes: '\\" + lineChars[currentIndex] + "'",
							BasicParserSettings.VERIFY_URI_SYNTAX);
				}
			}
			currentIndex++;
		}
		if (currentIndex >= lineChars.length) {
			throwEOFException();
		}
	}

	/**
	 * Moves the current line index position to the end of the BNode ID.
	 */
	private void moveToBNodeEndIndex() throws RDFParseException {
		incrementIndexOrThrowEOF();
		if (lineChars[currentIndex] != ':') {
			reportError("Expected ':', found: " + new String(Character.toChars(lineChars[currentIndex])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		currentIndex++;
		if (!ASCIIUtil.isLetterOrNumber(lineChars[currentIndex]) && !NTriplesUtil.isUnderscore(
				lineChars[currentIndex])) {
			reportError("Expected a letter or number or underscore, found: " + new String(
					Character.toChars(lineChars[currentIndex])),
					NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		while (currentIndex < lineChars.length && NTriplesUtil.isValidCharacterForBNodeLabel(lineChars[currentIndex])) {
			if (NTriplesUtil.isDot(lineChars[currentIndex])) {
				if (currentIndex + 1 >= lineChars.length || !NTriplesUtil.isValidCharacterForBNodeLabel(
						lineChars[currentIndex + 1])) {
					break;
				}
			}
			currentIndex++;
		}

		if (currentIndex == lineChars.length) {
			if (NTriplesUtil.isDot(lineChars[currentIndex - 1])) {
				currentIndex--;
			} else {
				throwEOFException();
			}
		}
	}

	/**
	 * Increments the current line index position and asserts EOF is not reached.
	 */
	private void incrementIndexOrThrowEOF() {
		currentIndex++;
		if (currentIndex >= lineChars.length) {
			throwEOFException();
		}
	}

	/**
	 * Attempts to read the next line from the buffered reader.
	 */
	private boolean readLine() throws IOException {
		String line = reader.readLine();
		if (line != null) {
			lineChars = line.toCharArray();
			lineNo++;
			currentIndex = 0;
			reportLocation(lineNo, 1);
			return true;
		}
		lineChars = null;
		currentIndex = -1;
		return false;
	}

	@Override
	protected IRI createURI(String uri) throws RDFParseException {
		try {
			uri = unescapeString(uri);
		} catch (IllegalArgumentException e) {
			reportError(e.getMessage(), NTriplesParserSettings.FAIL_ON_INVALID_LINES);
		}
		return super.createURI(uri);
	}

	/**
	 * Overrides {@link AbstractRDFParser#reportWarning(String)}, adding line number information to the error.
	 */
	@Override
	protected void reportWarning(String msg) {
		reportWarning(msg, lineNo, -1);
	}

	/**
	 * Overrides {@link AbstractRDFParser#reportError(String, RioSetting)}, adding line number information to the error.
	 */
	@Override
	protected void reportError(String msg, RioSetting setting) throws RDFParseException {
		reportError(msg, lineNo, -1, setting);
	}

	@Override
	protected void reportError(Exception e, RioSetting setting) throws RDFParseException {
		reportError(e, lineNo, -1, setting);
	}

	/**
	 * Overrides {@link AbstractRDFParser#reportFatalError(String)}, adding line number information to the error.
	 */
	@Override
	protected void reportFatalError(String msg) throws RDFParseException {
		reportFatalError(msg, lineNo, -1);
	}

	/**
	 * Overrides {@link AbstractRDFParser#reportFatalError(Exception)}, adding line number information to the error.
	 */
	@Override
	protected void reportFatalError(Exception e) throws RDFParseException {
		reportFatalError(e, lineNo, -1);
	}

	protected void throwEOFException() throws RDFParseException {
		throw new RDFParseException("Unexpected end of file");
	}

	@Override
	protected void clear() {
		currentIndex = -1;
		lineChars = null;
		super.clear();
	}

	/*
	 * N-Triples parser supports these settings.
	 */
	@Override
	public Collection> getSupportedSettings() {
		Collection> result = new HashSet<>(super.getSupportedSettings());

		result.add(NTriplesParserSettings.FAIL_ON_INVALID_LINES);

		return result;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy