// Source listing: com.bigdata.rdf.lexicon.LexiconKeyBuilder (Maven / Gradle / Ivy)
package com.bigdata.rdf.lexicon;
import java.util.Locale;

import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.XMLSchema;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.StrengthEnum;
/**
* Flyweight helper class for building (and decoding to the extent possible)
* unsigned byte[] keys for RDF {@link Value}s and term identifiers. In general,
* keys for RDF values are formed by a leading byte that indicates the type of
* the value (URI, BNode, or some type of Literal), followed by the components
* of that value type.
*
* @author Bryan Thompson
* @version $Id$
*/
public class LexiconKeyBuilder implements ITermIndexCodes {

    /**
     * The {@link IKeyBuilder} used to build every key produced by this class.
     * Each key-building method starts by calling {@link IKeyBuilder#reset()}
     * on it, so any partially built key held by the builder is discarded
     * whenever one of those methods is invoked.
     * <p>
     * NOTE(review): all methods share and reset this single builder, so
     * instances presumably must not be used by concurrent threads - confirm
     * against the {@link IKeyBuilder} contract.
     */
    public final IKeyBuilder keyBuilder;

    /**
     * Normally invoked by {@link Term2IdTupleSerializer#getLexiconKeyBuilder()}
     *
     * @param keyBuilder
     *            The {@link IKeyBuilder} that will determine the distinctions
     *            and sort order among the rdf {@link Value}s. In general, this
     *            should support Unicode and should use
     *            {@link StrengthEnum#Identical} so that all distinctions in the
     *            {@link Value} space are recognized by the lexicon.
     *
     * @see IKeyBuilder
     * @see IKeyBuilderFactory
     */
    protected LexiconKeyBuilder(final IKeyBuilder keyBuilder) {

        this.keyBuilder = keyBuilder;

    }

    /**
     * Returns the sort key for the URI. The key is the
     * {@link ITermIndexCodes#TERM_CODE_URI} prefix followed by the URI as
     * encoded by the {@link IKeyBuilder}.
     *
     * @param uri
     *            The URI.
     *
     * @return The sort key.
     */
    public byte[] uri2key(final String uri) {

        return keyBuilder.reset().append(TERM_CODE_URI).append(uri).getKey();

    }

    // public byte[] uriStartKey() {
    //
    // return keyBuilder.reset().append(TERM_CODE_URI).getKey();
    //
    // }
    //
    // public byte[] uriEndKey() {
    //
    // return keyBuilder.reset().append(TERM_CODE_LIT).getKey();
    //
    // }

    /**
     * Returns the sort key for a plain literal (no language code and no
     * datatype). The key is the {@link ITermIndexCodes#TERM_CODE_LIT} prefix
     * followed by the label as encoded by the {@link IKeyBuilder}.
     *
     * @param text
     *            The label of the literal.
     *
     * @return The sort key.
     */
    public byte[] plainLiteral2key(final String text) {

        return keyBuilder.reset().append(TERM_CODE_LIT).append(text).getKey();

    }

    /**
     * Returns the sort key for a literal having a language code. The key is
     * the {@link ITermIndexCodes#TERM_CODE_LCL} prefix, the upper-cased
     * language code as ASCII, a nul byte, and then the label as encoded by the
     * {@link IKeyBuilder}.
     * <p>
     * Note: The language code is serialized as US-ASCII UPPER CASE for the
     * purposes of defining the total key ordering. The character set for the
     * language code is restricted to [A-Za-z0-9] and "-" for separating subtype
     * codes. The RDF store interprets an empty language code as NO language
     * code, so we require that the languageCode is non-empty here. The language
     * code specifications require that the language code comparison is
     * case-insensitive, so we force the code to upper case for the purposes of
     * comparisons.
     * <p>
     * The upper-casing is done with {@link Locale#ROOT} so that the generated
     * key does not depend on the default locale of the JVM. With the default
     * locale, a Turkish or Azeri JVM would map "i" to U+0130 (dotted capital
     * I), which is not an ASCII character and would yield a different key for
     * the same literal than any other JVM.
     *
     * @param languageCode
     *            The language code (non-empty; only checked by an
     *            <code>assert</code>).
     * @param text
     *            The label of the literal.
     *
     * @return The sort key.
     *
     * @see Literal#getLanguage()
     */
    public byte[] languageCodeLiteral2key(final String languageCode,
            final String text) {

        assert languageCode.length() > 0;

        keyBuilder.reset().append(TERM_CODE_LCL);

        // Locale-independent case mapping keeps the code within US-ASCII.
        final String upperCaseCode = languageCode.toUpperCase(Locale.ROOT);

        keyBuilder.appendASCII(upperCaseCode).appendNul();

        return keyBuilder.append(text).getKey();

    }

    /**
     * Formats a datatype literal sort key. The key is the
     * {@link ITermIndexCodes#TERM_CODE_DTL} prefix, the datatype URI, the
     * lexical form of the value, and a trailing nul byte.
     *
     * @param datatype
     *            The datatype URI.
     * @param value
     *            The lexical form of the literal.
     *
     * @return The sort key.
     *
     * @throws IllegalArgumentException
     *             if either argument is <code>null</code>.
     */
    public byte[] datatypeLiteral2key(final URI datatype, final String value) {

        if (datatype == null)
            throw new IllegalArgumentException();

        if (value == null)
            throw new IllegalArgumentException();

        // Deliberately disabled branch - see the @todo below.
        if (false && datatype.equals(XMLSchema.STRING)) {

            /*
             * @todo xsd:string is explicitly mapped by RDF Semantics onto plain
             * literals (they entail one another). However, it breaks the SPARQL
             * unit tests if you map them onto the same key.
             */
            return plainLiteral2key(value);

        }

        /*
         * Note: The full lexical form of the data type URI is serialized into
         * the key as a Unicode sort key, immediately followed by a Unicode
         * sort key formed from the lexical form of the data type value, and
         * then a nul byte.
         *
         * NOTE(review): an earlier version of this comment described the nul
         * byte as separating the datatype URI from the value. The code places
         * it after the value. The order is left as-is here since changing it
         * would change the generated keys - confirm which was intended.
         */

        // clear out any existing key and add prefix for the DTL space.
        keyBuilder.reset().append(TERM_CODE_DTL);

        // encode the datatype URI as Unicode sort key to make all data
        // types disjoint.
        keyBuilder.append(datatype.stringValue());

        // encode the datatype value as Unicode sort key.
        keyBuilder.append(value);

        keyBuilder.appendNul();

        return keyBuilder.getKey();

    }

    // /**
    // * The key corresponding to the start of the literals section of the
    // * terms index.
    // */
    // public byte[] litStartKey() {
    //
    // return keyBuilder.reset().append(TERM_CODE_LIT).getKey();
    //
    // }
    //
    // /**
    // * The key corresponding to the first key after the literals section of
    // the
    // * terms index.
    // */
    // public byte[] litEndKey() {
    //
    // return keyBuilder.reset().append(TERM_CODE_BND).getKey();
    //
    // }

    /**
     * Returns the sort key for a blank node. The key is the
     * {@link ITermIndexCodes#TERM_CODE_BND} prefix followed by the blank node
     * identifier as encoded by the {@link IKeyBuilder}.
     *
     * @param id
     *            The blank node identifier.
     *
     * @return The sort key.
     */
    public byte[] blankNode2Key(final String id) {

        return keyBuilder.reset().append(TERM_CODE_BND).append(id).getKey();

    }

    /**
     * Return an unsigned byte[] that locates the value within a total ordering
     * over the RDF value space. Dispatches on the runtime type of the value:
     * {@link URI}s go to {@link #uri2key(String)}, {@link Literal}s to one of
     * the literal key methods (language code takes precedence over datatype,
     * otherwise the literal is treated as plain), and {@link BNode}s to
     * {@link #blankNode2Key(String)}.
     *
     * @param value
     *            An RDF value.
     *
     * @return The sort key for that RDF value.
     *
     * @throws IllegalArgumentException
     *             if the value is <code>null</code>.
     * @throws AssertionError
     *             if the value is not a {@link URI}, {@link Literal} or
     *             {@link BNode}.
     */
    public byte[] value2Key(final Value value) {

        if (value == null)
            throw new IllegalArgumentException();

        if (value instanceof URI) {

            final URI uri = (URI) value;

            final String term = uri.toString();

            return uri2key(term);

        } else if (value instanceof Literal) {

            final Literal lit = (Literal) value;

            final String text = lit.getLabel();

            final String languageCode = lit.getLanguage();

            final URI datatypeUri = lit.getDatatype();

            if (languageCode != null) {

                /*
                 * language code literal.
                 */
                return languageCodeLiteral2key(languageCode, text);

            } else if (datatypeUri != null) {

                /*
                 * datatype literal.
                 */
                return datatypeLiteral2key(datatypeUri, text);

            } else {

                /*
                 * plain literal.
                 */
                return plainLiteral2key(text);

            }

        } else if (value instanceof BNode) {

            /*
             * @todo if we know that the bnode id is a UUID that we generated
             * then we should encode that using faster logic than this unicode
             * conversion and stick the sort key on the bnode so that we do not
             * have to convert UUID to id:String to key:byte[].
             */
            final String bnodeId = ((BNode) value).getID();

            return blankNode2Key(bnodeId);

        } else {

            throw new AssertionError("Unknown value type: " + value.getClass());

        }

    }

}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy