org.eclipse.rdf4j.rio.hdt.HDTPart Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rdf4j-rio-hdt Show documentation
Show all versions of rdf4j-rio-hdt Show documentation
Experimental Rio parser and writer implementation for the HDT file format.
The newest version!
/*******************************************************************************
* Copyright (c) 2020 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.hdt;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.CheckedInputStream;
/**
* Helper class for different HDT parts.
*
* Each part starts with $HDT
, followed by a byte indicating the type of the part.
*
* Structure:
*
*
* +------+------+
* | $HDT | type |
* +------+------+
*
*
* @author Bart Hanssens
*/
abstract class HDTPart {
enum Type {
GLOBAL((byte) 1),
HEADER((byte) 2),
DICTIONARY((byte) 3),
TRIPLES((byte) 4);
private final byte value;
/**
* Get value associated with this type
*
* @return value 1,2 or 3
*/
byte getValue() {
return value;
}
Type(byte value) {
this.value = value;
}
}
final static byte[] COOKIE = "$HDT".getBytes(StandardCharsets.US_ASCII);
// TODO: make configurable, buffer for reading object values
private final static int BUFLEN = 1 * 1024 * 1024;
// for debugging purposes
final String name;
final long pos;
Map properties;
/**
* Parse from input stream
*
* @param is
* @throws IOException
*/
abstract void parse(InputStream is) throws IOException;
/**
* Get properties, if any.
*
* @return key, value map
*/
Map getProperties() {
return properties;
}
/**
* Constructor
*
* @param name part name
* @param pos starting position in input stream
*/
HDTPart(String name, long pos) {
this.name = name;
this.pos = pos;
}
/**
* Constructor
*/
HDTPart() {
this("", -1);
}
/**
* Get a string for debugging purposes, containing the name and starting position of this part.
*
* @return string
*/
String getDebugPartStr() {
if (name == null || name.isEmpty()) {
return "";
}
return (pos != -1) ? name + " (starts at byte " + pos + ")" : name;
}
/**
* Check start of part for $HDT
and the byte indicating the type
*
* @param is input stream
* @param ctype control type
* @throws IOException
*/
static void checkControl(InputStream is, HDTPart.Type ctype) throws IOException {
byte[] cookie = new byte[COOKIE.length];
is.read(cookie);
if (!Arrays.equals(cookie, COOKIE)) {
throw new IOException("$HDT marker not found");
}
byte b = (byte) is.read();
if (b != ctype.getValue()) {
throw new IOException("Info type " + Long.toHexString(b) + ", but expected different control info type");
}
}
/**
* Check for null
terminated format string.
*
* @param is
* @param format
* @throws IOException
*/
static void checkFormat(InputStream is, byte[] format) throws IOException {
byte[] b = new byte[format.length];
is.read(b);
if (!Arrays.equals(b, format)) {
throw new IOException("Unknown format, expected " + new String(format, StandardCharsets.US_ASCII));
}
is.read(); // also read null byte
}
/**
* Read null terminated series of bytes
*
* @param is input stream
* @return
* @throws IOException
*/
static byte[] readToNull(InputStream is) throws IOException {
byte[] buf = new byte[BUFLEN];
int len = 0;
do {
buf[len] = (byte) is.read();
} while (buf[len] != 0b00 && ++len < BUFLEN);
if (len == BUFLEN) {
throw new IOException("Buffer for reading properties exceeded, max " + BUFLEN);
}
return Arrays.copyOf(buf, len);
}
/**
* Get the first position of the NULL byte within an array of bytes
*
* @param b byte array
* @param start position to start from
* @return position of first NULL byte
*/
static int countToNull(byte[] b, int start) throws IOException {
for (int i = start; i < b.length; i++) {
if (b[i] == 0b00) {
return i;
}
}
throw new IOException("No null byte found in buffer starting at byte " + start);
}
/**
* Get the properties from the input stream, reading at most BUFLEN bytes. The properties are encoded as a
* key=value;
string and must be null
terminated.
*
* @param is input stream
* @return key, value map
* @throws IOException
*/
static Map getProperties(InputStream is) throws IOException {
return mapProperties(readToNull(is));
}
/**
* Get properties as a key, value map
*
* @param props
* @return
*/
static Map mapProperties(byte[] props) {
Map map = new HashMap<>();
if (props == null || props.length == 0) {
return map;
}
String[] strings = new String(props, StandardCharsets.US_ASCII).split(";");
for (String string : strings) {
String[] prop = string.split("=");
if (prop.length == 2) {
map.put(prop[0], prop[1]);
}
}
return map;
}
/**
* Get the positive integer value from a property map. Throw an exception when the property is missing, or less than
* 1.
*
* @param props property map
* @param prop name of the property
* @param name display name of the property
* @return positive integer
* @throws IOException
*/
int getIntegerProperty(Map props, String prop, String name) throws IOException {
int len;
String str = props.getOrDefault(prop, "0");
try {
len = Integer.parseInt(str);
} catch (NumberFormatException nfe) {
throw new IOException(name + " is not an integer: " + str);
}
if (len < 1) {
throw new IOException(name + " is less than 1: " + len);
}
return len;
}
/**
* Compare the calculated checksum to the expected one.
*
* @param cis checked input stream
* @param is (unchecked) input stream
* @param len number of bytes of the checksum
* @throws IOException
*/
static void checkCRC(CheckedInputStream cis, InputStream is, int len) throws IOException {
long calc = cis.getChecksum().getValue();
byte[] checksum = new byte[len];
is.read(checksum);
long expect = 0L;
// little-endian to big-endian, e.g. HDT-It stores checksum 7635 as 0x35 0x76 (at least on x86)
for (int i = len - 1; i >= 0; i--) {
expect <<= 8;
expect |= checksum[i] & 0xFF;
}
if (calc != expect) {
throw new IOException("CRC does not match: calculated " +
Long.toHexString(calc) + " instead of " + Long.toHexString(expect));
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy