org.eclipse.rdf4j.rio.hdt.HDTDictionarySectionPFC Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rdf4j-rio-hdt Show documentation
Show all versions of rdf4j-rio-hdt Show documentation
Experimental Rio parser and writer implementation for the HDT file format.
The newest version!
/*******************************************************************************
* Copyright (c) 2020 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.hdt;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.zip.CheckedInputStream;
import org.eclipse.rdf4j.common.io.UncloseableInputStream;
/**
* HDT DictionarySection Plain Front Coding.
*
* This part starts with a byte indicating the type of the dictionary section, followed by the VByte-encoded number of
* strings, the VByte-encoded buffer size and the VByte-encoded buffer length.
*
* Then the 8-bit CRC.
*
* Followed by an array and one or more buffers, and the 32-bit CRC calculated over the index and the buffers.
*
* Structure:
*
*
* +------+--------------+--------------+-------+------+-------+--------+...+-------+
* | type | totalStrings | stringsBlock | array | CRC8 | index | buffer |...| CRC32 |
* +------+--------------+--------------+-------+------+-------+--------+...+-------+
*
*
* Each buffer starts with a full string, followed by a maximum of stringsBlock
- 1 pair of a VByte-encoded
* number of characters this string has in common with the _previous_ string, and the (different) suffix.
*
* E.g. abcdef 2 gh 3 ij
will result in abcde, abgh, abgij
.
*
* Buffer structure:
*
*
* +--------+--------+--------+...+--------+--------+
* | string | common | suffix |...| common | suffix |
* +--------+--------+--------+...+--------+--------+
*
*
* @author Bart Hanssens
*/
class HDTDictionarySectionPFC extends HDTDictionarySection {
private byte[] buffer;
private int totalStrings;
private int stringsBlock;
private HDTArray blockStarts;
// keep most recently used blocks in memory as decoded values
private final LinkedHashMap> cache = new LinkedHashMap<>(100, 1, true) {
@Override
protected boolean removeEldestEntry(Map.Entry eldest) {
return size() > 99;
}
};
/**
* Constructor
*
* @param name
* @param pos
*/
HDTDictionarySectionPFC(String name, long pos) {
super(name, pos);
}
@Override
int size() {
return totalStrings;
}
@Override
byte[] get(int i) throws IOException {
// HDT index start counting from 1
int idx = i - 1;
// get the block this string belongs to, and maintain the cache of recently used blocks
int block = idx / stringsBlock;
ArrayList strings = cache.get(block);
if (strings == null) {
int blockStart = blockStarts.get(block);
strings = decodeBlock(block, blockStart);
cache.put(block, strings);
}
return strings.get(idx - (block * stringsBlock));
}
@Override
void parse(InputStream is) throws IOException {
CRC8 crc8 = new CRC8();
crc8.update((byte) HDTDictionarySection.Type.FRONT.getValue());
int buflen;
// don't close CheckedInputStream, as it will close the underlying inputstream
try (UncloseableInputStream uis = new UncloseableInputStream(is);
CheckedInputStream cis = new CheckedInputStream(uis, crc8)) {
long val = VByte.decode(cis);
if (val > Integer.MAX_VALUE) {
throw new UnsupportedOperationException(getDebugPartStr() + " max number of strings exceeded: " + val);
}
totalStrings = (int) val;
val = VByte.decode(cis);
if (val > Integer.MAX_VALUE) {
throw new UnsupportedOperationException(getDebugPartStr() + " max buffer length exceeded: " + val);
}
buflen = (int) val;
val = VByte.decode(cis);
if (val > Integer.MAX_VALUE) {
throw new UnsupportedOperationException(
getDebugPartStr() + "max number of strings per exceeded: " + val);
}
stringsBlock = (int) val;
checkCRC(cis, is, 1);
}
// keep track of starting positions of the blocks
blockStarts = HDTArrayFactory.parse(is);
blockStarts.parse(is);
// don't close CheckedInputStream, as it will close the underlying inputstream
try (UncloseableInputStream uis = new UncloseableInputStream(is);
CheckedInputStream cis = new CheckedInputStream(uis, new CRC32())) {
buffer = new byte[buflen];
cis.read(buffer);
checkCRC(cis, is, 4);
}
}
/**
* Parse a single block
*
* @param block block number
* @param start starting position
* @return list of decoded byte strings
* @throws IOException
*/
private ArrayList decodeBlock(int block, int start) throws IOException {
ArrayList arr = new ArrayList<>(stringsBlock);
// initial string
int idx = start;
int end = HDTPart.countToNull(buffer, idx);
byte[] str = Arrays.copyOfRange(buffer, idx, end);
arr.add(str);
idx = end + 1;
// read the remaining strings, with a maximum of stringsBlock
int remaining = totalStrings - (block * stringsBlock);
for (int j = 1; j < stringsBlock && j < remaining; j++) {
int common = (int) VByte.decodeFrom(buffer, idx);
idx += VByte.encodedLength(common);
end = HDTPart.countToNull(buffer, idx);
byte[] suffix = Arrays.copyOfRange(buffer, idx, end);
// copy the common part and add the suffix
str = Arrays.copyOf(str, common + suffix.length);
System.arraycopy(suffix, 0, str, common, suffix.length);
arr.add(str);
idx = end + 1;
}
return arr;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy