org.eclipse.rdf4j.rio.hdt.HDTParser Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2020 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.hdt;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.input.CountingInputStream;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
/**
* RDF parser for HDT v1.0 files. This parser is not thread-safe, therefore its public methods are synchronized.
*
* Unfortunately the draft specification is not entirely clear and probably slightly out of date, since the open source
* reference implementation HDT-It seems to implement a slightly different version. This parser tries to be compatible
* with HDT-It 1.0.
*
* The most important parts are the Dictionaries containing the actual values (S, P, O part of a triple), and the
* Triples containing the numeric references to construct the triples.
*
* Since objects in one triple are often subjects in another triple, these "shared" parts are stored in a shared
* Dictionary, which may significantly reduce the file size.
*
* File structure:
*
*
* +---------------------+
* | Global |
* | Header |
* | Dictionary (Shared) |
* | Dictionary (S) |
* | Dictionary (P) |
* | Dictionary (O) |
* | Triples |
* +---------------------+
*
*
* @author Bart Hanssens
* @see HDT draft (2015)
* @see W3C Member Submission (2011)
*/
public class HDTParser extends AbstractRDFParser {
/**
* Creates a new HDTParser that will use a {@link SimpleValueFactory} to create RDF model objects.
*/
public HDTParser() {
super();
}
/**
* Creates a new HDTParser that will use the supplied ValueFactory to create RDF model objects.
*
* @param valueFactory A ValueFactory.
*/
public HDTParser(ValueFactory valueFactory) {
super(valueFactory);
}
@Override
public RDFFormat getRDFFormat() {
return RDFFormat.HDT;
}
@Override
public Collection> getSupportedSettings() {
return Set.of();
}
@Override
public synchronized void parse(InputStream in, String baseURI)
throws IOException, RDFParseException, RDFHandlerException {
if (in == null) {
throw new IllegalArgumentException("Input stream must not be 'null'");
}
// if (in instanceof FileInputStream) {
// TODO: use more optimized way to parse the file, eg. filechannel / membuffer
// }
HDTDictionarySection shared = null;
HDTDictionarySection subjects = null;
HDTDictionarySection predicates = null;
HDTDictionarySection objects = null;
HDTTriplesSection section = null;
// not using try-with-resources, since the counter is needed in the catch clause (JDK8)
CountingInputStream bis = new CountingInputStream(in);
try {
reportLocation(0, -1);
HDTGlobal global = new HDTGlobal();
global.parse(bis);
Map globalProps = global.getProperties();
String base = globalProps.getOrDefault(HDTGlobal.GLOBAL_BASEURI, "");
if (!base.isEmpty()) {
setBaseURI(base);
}
reportLocation(bis.getByteCount(), -1);
HDTHeader header = new HDTHeader();
header.parse(bis);
reportLocation(bis.getByteCount(), -1);
new HDTDictionary().parse(bis);
long dpos = bis.getByteCount();
reportLocation(dpos, -1);
shared = HDTDictionarySectionFactory.parse(bis, "S+O", dpos);
shared.parse(bis);
dpos = bis.getByteCount();
reportLocation(dpos, -1);
subjects = HDTDictionarySectionFactory.parse(bis, "S", dpos);
subjects.parse(bis);
dpos = bis.getByteCount();
reportLocation(dpos, -1);
predicates = HDTDictionarySectionFactory.parse(bis, "P", dpos);
predicates.parse(bis);
dpos = bis.getByteCount();
reportLocation(dpos, -1);
objects = HDTDictionarySectionFactory.parse(bis, "O", dpos);
objects.parse(bis);
reportLocation(bis.getByteCount(), -1);
HDTTriples triples = new HDTTriples();
triples.parse(bis);
reportLocation(bis.getByteCount(), -1);
section = HDTTriplesSectionFactory.parse(new String(HDTTriples.FORMAT_BITMAP));
section.parse(bis, triples.getOrder());
} catch (IOException ioe) {
reportFatalError(ioe.getMessage(), bis.getCount(), -1);
} finally {
bis.close();
}
if (rdfHandler != null) {
rdfHandler.startRDF();
}
assert shared != null;
int size = shared.size();
assert section != null;
while (section.hasNext()) {
int[] t = section.next();
byte[] s = getSO(t[0], size, shared, subjects);
byte[] p = predicates.get(t[1]);
byte[] o = getSO(t[2], size, shared, objects);
Statement stmt = valueFactory.createStatement(createSubject(s), createPredicate(p), createObject(o));
if (rdfHandler != null) {
rdfHandler.handleStatement(stmt);
}
}
if (rdfHandler != null) {
rdfHandler.endRDF();
}
}
/**
* Not supported, since HDT is a binary format.
*/
@Override
public synchronized void parse(Reader reader, String baseURI)
throws RDFParseException, RDFHandlerException {
throw new UnsupportedOperationException("HDT is binary, text readers not supported.");
}
/**
* Get part of triple from shared HDT Dictionary or (if not found) from specific HDT Dictionary
*
* @param pos position
* @param size size of shared Dictionary
* @param shared shared Dictionary
* @param other specific Dictionary
* @return subject or object
*/
private byte[] getSO(int pos, int size, HDTDictionarySection shared, HDTDictionarySection other)
throws IOException {
return (pos <= size) ? shared.get(pos) : other.get(pos - size);
}
private boolean isBNodeID(byte[] b) {
// HDT-It generates "genid" for blank nodes in RDF/XML
return (b[0] == '_' || (b.length > 5 && b[0] == 'g' && b[1] == 'e'));
}
/**
* Create subject IRI or blank node
*
* @param b byte buffer
* @return IRI or blank node
*/
private Resource createSubject(byte[] b) {
String str = new String(b, StandardCharsets.UTF_8);
return isBNodeID(b) ? valueFactory.createBNode(str) : valueFactory.createIRI(str);
}
/**
* Create predicate IRI
*
* @param b byte buffer
* @return IRI
*/
private IRI createPredicate(byte[] b) {
return valueFactory.createIRI(new String(b, StandardCharsets.UTF_8));
}
/**
* Create object (typed) literal, IRI or blank node
*
* @param b byte buffer
* @return literal, IRI or blank node
*/
private Value createObject(byte[] b) {
if (b[0] == '"') {
int i = b.length - 1;
for (; i > 1 && b[i] != '"'; i--) {
if (b[i] == '@') {
String lang = new String(b, i + 1, b.length - i - 1, StandardCharsets.US_ASCII);
return valueFactory.createLiteral(new String(b, 1, i - 2, StandardCharsets.UTF_8), lang);
} else if (b[i] == '^') {
IRI datatype = valueFactory
.createIRI(new String(b, i + 2, b.length - i - 3, StandardCharsets.US_ASCII));
return valueFactory.createLiteral(new String(b, 1, i - 3, StandardCharsets.UTF_8), datatype);
}
}
return valueFactory.createLiteral(new String(b, 1, i - 1, StandardCharsets.UTF_8));
}
String str = new String(b, StandardCharsets.UTF_8);
return isBNodeID(b) ? valueFactory.createBNode(str) : valueFactory.createIRI(str);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy