// com.bigdata.rdf.lexicon.LexiconKeyBuilder Maven / Gradle / Ivy
//
// Go to download
package com.bigdata.rdf.lexicon;

import java.util.Locale;

import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.XMLSchema;

import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.StrengthEnum;

/**
 * Flyweight helper class for building (and decoding to the extent possible)
 * unsigned byte[] keys for RDF {@link Value}s and term identifiers. In general,
 * keys for RDF values are formed by a leading byte that indicates the type of
 * the value (URI, BNode, or some type of Literal), followed by the components
 * of that value type.
 * <p>
 * Note: Every method on this class resets the shared {@link #keyBuilder} and
 * then appends to it, so the key under construction is mutable state of the
 * instance. NOTE(review): this presumably means that an instance must not be
 * used by more than one thread at a time - confirm against the
 * {@link IKeyBuilder} contract.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class LexiconKeyBuilder implements ITermIndexCodes {

    /**
     * The object used to build up the keys. It is reset at the start of each
     * key building method on this class.
     */
    public final IKeyBuilder keyBuilder;

    /**
     * Normally invoked by {@link Term2IdTupleSerializer#getLexiconKeyBuilder()}
     * 
     * @param keyBuilder
     *            The {@link IKeyBuilder} that will determine the distinctions
     *            and sort order among the rdf {@link Value}s. In general, this
     *            should support Unicode and should use
     *            {@link StrengthEnum#Identical} so that all distinctions in the
     *            {@link Value} space are recognized by the lexicon.
     * 
     * @see IKeyBuilder
     * @see IKeyBuilderFactory
     */
    protected LexiconKeyBuilder(final IKeyBuilder keyBuilder) {

        this.keyBuilder = keyBuilder;

    }

    /**
     * Returns the sort key for the URI. The key is the {@link #TERM_CODE_URI}
     * prefix followed by the URI as encoded by the {@link #keyBuilder}.
     * 
     * @param uri
     *            The URI.
     * 
     * @return The sort key.
     */
    public byte[] uri2key(final String uri) {

        return keyBuilder.reset().append(TERM_CODE_URI).append(uri).getKey();

    }

    // public byte[] uriStartKey() {
    //            
    // return keyBuilder.reset().append(TERM_CODE_URI).getKey();
    //            
    // }
    //
    // public byte[] uriEndKey() {
    //            
    // return keyBuilder.reset().append(TERM_CODE_LIT).getKey();
    //            
    // }

    /**
     * Returns the sort key for a plain literal (a literal having neither a
     * language code nor a datatype URI). The key is the {@link #TERM_CODE_LIT}
     * prefix followed by the label as encoded by the {@link #keyBuilder}.
     * 
     * @param text
     *            The label of the literal.
     * 
     * @return The sort key.
     */
    public byte[] plainLiteral2key(final String text) {

        return keyBuilder.reset().append(TERM_CODE_LIT).append(text).getKey();

    }

    /**
     * Returns the sort key for a literal having a language code. The key is
     * the {@link #TERM_CODE_LCL} prefix, the upper-cased language code as
     * ASCII, a nul byte, and then the label as encoded by the
     * {@link #keyBuilder}.
     * <p>
     * Note: The language code is serialized as US-ASCII UPPER CASE for the
     * purposes of defining the total key ordering. The character set for the
     * language code is restricted to [A-Za-z0-9] and "-" for separating subtype
     * codes. The RDF store interprets an empty language code as NO language
     * code, so we require that the languageCode is non-empty here (this is
     * only checked when assertions are enabled). The language code
     * specifications require that the language code comparison is
     * case-insensitive, so we force the code to upper case for the purposes of
     * comparisons.
     * <p>
     * Note: The conversion to upper case uses {@link Locale#ROOT} rather than
     * the default locale of the JVM so that the same language code always
     * produces the same key. With a locale sensitive conversion the result
     * depends on where the JVM is running, e.g., under a Turkish default
     * locale "i" is converted to U+0130, which is not an ASCII character.
     * 
     * @param languageCode
     *            The language code (non-empty).
     * @param text
     *            The label of the literal.
     * 
     * @return The sort key.
     * 
     * @throws NullPointerException
     *             if the <i>languageCode</i> is <code>null</code>.
     * 
     * @see Literal#getLanguage()
     */
    public byte[] languageCodeLiteral2key(final String languageCode,
            final String text) {

        assert languageCode.length() > 0;

        keyBuilder.reset().append(TERM_CODE_LCL);

        // Locale independent case folding keeps the key stable across JVMs.
        final String upperCaseCode = languageCode.toUpperCase(Locale.ROOT);

        keyBuilder.appendASCII(upperCaseCode).appendNul();

        return keyBuilder.append(text).getKey();

    }

    /**
     * Formats a datatype literal sort key. The key is the
     * {@link #TERM_CODE_DTL} prefix, the datatype URI, the lexical form of the
     * value, and then a nul byte.
     * 
     * @param datatype
     *            The datatype URI.
     * @param value
     *            The lexical form of the datatype value.
     * 
     * @return The sort key.
     * 
     * @throws IllegalArgumentException
     *             if either argument is <code>null</code>.
     */
    public byte[] datatypeLiteral2key(final URI datatype, final String value) {

        if (datatype == null)
            throw new IllegalArgumentException();
        
        if (value == null)
            throw new IllegalArgumentException();

        if (false && datatype.equals(XMLSchema.STRING)) {

            /*
             * @todo xsd:string is explicitly mapped by RDF Semantics onto plain
             * literals (they entail one another). However, it breaks the SPARQL
             * unit tests if you map them onto the same key.
             */
            return plainLiteral2key(value);
            
        }

        /*
         * Note: The full lexical form of the data type URI is serialized into
         * the key as a Unicode sort key, followed by a Unicode sort key formed
         * from the lexical form of the data type value, followed by a nul
         * byte.
         * 
         * NOTE(review): This comment used to describe the nul byte as falling
         * between the datatype URI and the value, which is not what the code
         * does. The order of the appends is deliberately left as is since
         * changing it would change the keys generated by this method.
         */
        
        // clear out any existing key and add prefix for the DTL space.
        keyBuilder.reset().append(TERM_CODE_DTL);

        // encode the datatype URI as Unicode sort key to make all data
        // types disjoint.
        keyBuilder.append(datatype.stringValue());

        // encode the datatype value as Unicode sort key.
        keyBuilder.append(value);

        keyBuilder.appendNul();

        return keyBuilder.getKey();

    }

    // /**
    // * The key corresponding to the start of the literals section of the
    // * terms index.
    // */
    // public byte[] litStartKey() {
    //            
    // return keyBuilder.reset().append(TERM_CODE_LIT).getKey();
    //            
    // }
    //
    // /**
    // * The key corresponding to the first key after the literals section of
    // the
    // * terms index.
    // */
    // public byte[] litEndKey() {
    //            
    // return keyBuilder.reset().append(TERM_CODE_BND).getKey();
    //            
    // }

    /**
     * Returns the sort key for a blank node. The key is the
     * {@link #TERM_CODE_BND} prefix followed by the blank node identifier as
     * encoded by the {@link #keyBuilder}.
     * 
     * @param id
     *            The blank node identifier.
     * 
     * @return The sort key.
     */
    public byte[] blankNode2Key(final String id) {

        return keyBuilder.reset().append(TERM_CODE_BND).append(id).getKey();

    }

    /**
     * Return an unsigned byte[] that locates the value within a total ordering
     * over the RDF value space.
     * <p>
     * The value is dispatched on its type: {@link URI}, {@link Literal}, or
     * {@link BNode}. For a {@link Literal} the language code is tested first,
     * then the datatype URI, and the literal is otherwise handled as a plain
     * literal.
     * 
     * @param value
     *            An RDF value.
     * 
     * @return The sort key for that RDF value.
     * 
     * @throws IllegalArgumentException
     *             if the <i>value</i> is <code>null</code>.
     * @throws AssertionError
     *             if the <i>value</i> is not a {@link URI}, a {@link Literal},
     *             or a {@link BNode}.
     */
    public byte[] value2Key(final Value value) {

        if (value == null)
            throw new IllegalArgumentException();

        if (value instanceof URI) {

            final URI uri = (URI) value;

            final String term = uri.toString();

            return uri2key(term);

        } else if (value instanceof Literal) {

            final Literal lit = (Literal) value;

            final String text = lit.getLabel();

            final String languageCode = lit.getLanguage();

            final URI datatypeUri = lit.getDatatype();

            if (languageCode != null) {

                /*
                 * language code literal.
                 */
                return languageCodeLiteral2key(languageCode, text);

            } else if (datatypeUri != null) {

                /*
                 * datatype literal.
                 */
                return datatypeLiteral2key(datatypeUri, text);

            } else {

                /*
                 * plain literal.
                 */
                return plainLiteral2key(text);

            }

        } else if (value instanceof BNode) {

            /*
             * @todo if we know that the bnode id is a UUID that we generated
             * then we should encode that using faster logic that this unicode
             * conversion and stick the sort key on the bnode so that we do not
             * have to convert UUID to id:String to key:byte[].
             */
            final String bnodeId = ((BNode) value).getID();

            return blankNode2Key(bnodeId);

        } else {

            throw new AssertionError("Unknown value type: " + value.getClass());

        }

    }

}