com.bigdata.rdf.lexicon.TestLexiconKeyBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bigdata-rdf-test Show documentation
Blazegraph(TM) RDF Test Suites
There is a newer version: 2.1.4
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Jun 30, 2011
 */

package com.bigdata.rdf.lexicon;

import java.util.Locale;
import java.util.Properties;

import junit.framework.TestCase2;

import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.BNodeImpl;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.XMLSchema;

import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.rdf.internal.XSD;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.BytesUtil.UnsignedByteArrayComparator;

/**
 * Test suite for {@link LexiconKeyBuilder}.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class TestLexiconKeyBuilder extends TestCase2 {

    /**
     * Creates an unnamed instance of this test suite.
     */
    public TestLexiconKeyBuilder() {
        super();
    }

    /**
     * Creates a named instance of this test suite.
     * 
     * @param name
     *            The name handed through to the superclass constructor.
     */
    public TestLexiconKeyBuilder(final String name) {
        super(name);
    }

    /**
     * The {@link LexiconKeyBuilder} under test. Assigned in {@link #setUp()}
     * and cleared in {@link #tearDown()}.
     */
    private LexiconKeyBuilder fixture = null;

    /**
     * {@inheritDoc}
     * <p>
     * Creates the {@link LexiconKeyBuilder} under test on top of an
     * {@link IKeyBuilder} obtained from a {@link DefaultKeyBuilderFactory}
     * which is configured from {@link #getProperties()}.
     * <p>
     * Note: The {@link LexiconKeyBuilder} will wind up configured with the
     * default {@link Locale} unless that gets overridden by
     * {@link #getProperties()}
     */
    @Override
    protected void setUp() throws Exception {

        super.setUp();

        final IKeyBuilderFactory keyBuilderFactory = new DefaultKeyBuilderFactory(
                getProperties());

        final IKeyBuilder keyBuilder = keyBuilderFactory.getKeyBuilder();

        fixture = new LexiconKeyBuilder(keyBuilder);

    }
    
    /**
     * {@inheritDoc}
     * <p>
     * Clears the reference to the {@link LexiconKeyBuilder} under test before
     * delegating to the superclass.
     */
    @Override
    protected void tearDown() throws Exception {

        // drop the reference so the fixture is not retained between tests.
        fixture = null;

        super.tearDown();

    }
    
    /**
     * Encodes one {@link Value} of each kind and verifies that the first byte
     * of the generated key is the expected term "code" byte.
     * 
     * @see ITermIndexCodes
     */
    public void test_encodeDecodeCodeByte() {

        // URI
        final byte[] uriKey = fixture.value2Key(RDF.TYPE);
        assertEquals(ITermIndexCodes.TERM_CODE_URI, uriKey[0]);

        // blank node
        final byte[] bnodeKey = fixture.value2Key(new BNodeImpl("foo"));
        assertEquals(ITermIndexCodes.TERM_CODE_BND, bnodeKey[0]);

        // plain literal
        final byte[] plainKey = fixture.value2Key(new LiteralImpl("abc"));
        assertEquals(ITermIndexCodes.TERM_CODE_LIT, plainKey[0]);

        // language code literal
        final byte[] langKey = fixture.value2Key(new LiteralImpl("abc", "en"));
        assertEquals(ITermIndexCodes.TERM_CODE_LCL, langKey[0]);

        // datatype literal
        final byte[] datatypeKey = fixture.value2Key(new LiteralImpl("abc",
                XSD.BOOLEAN));
        assertEquals(ITermIndexCodes.TERM_CODE_DTL, datatypeKey[0]);

    }

    /**
     * Verifies the coarse ordering of the keys generated for the different
     * kinds of {@link Value}: URIs, then plain literals, then language code
     * literals, then datatype literals, then blank nodes. The ordering of the
     * sort keys generated for the string data itself is deliberately not
     * examined here.
     * 
     * @see ITermIndexCodes
     */
    public void test_keyOrder() {

        // one key per kind of Value.
        final byte[] uriKey = fixture.value2Key(RDF.TYPE);
        final byte[] bnodeKey = fixture.value2Key(new BNodeImpl("foo"));
        final byte[] plainKey = fixture.value2Key(new LiteralImpl("abc"));
        final byte[] langKey = fixture.value2Key(new LiteralImpl("abc", "en"));
        final byte[] datatypeKey = fixture.value2Key(new LiteralImpl("abc",
                XSD.BOOLEAN));

        // URIs before plain literals.
        final int uriVsPlain = UnsignedByteArrayComparator.INSTANCE.compare(
                uriKey, plainKey);
        assertTrue(uriVsPlain < 0);

        // plain literals before language code literals.
        final int plainVsLang = UnsignedByteArrayComparator.INSTANCE.compare(
                plainKey, langKey);
        assertTrue(plainVsLang < 0);

        // language code literals before datatype literals.
        final int langVsDatatype = UnsignedByteArrayComparator.INSTANCE
                .compare(langKey, datatypeKey);
        assertTrue(langVsDatatype < 0);

        // datatype literals before blank nodes.
        final int datatypeVsBnode = UnsignedByteArrayComparator.INSTANCE
                .compare(datatypeKey, bnodeKey);
        assertTrue(datatypeVsBnode < 0);

    }
    
    /**
     * Verifies the relative order of the keys generated for three URIs: a
     * root, a path beneath that root, and the same path on a ".com" host.
     */
    public void test_uri() {

        final String orgRoot = "http://www.cognitiveweb.org";
        final String orgPath = "http://www.cognitiveweb.org/a";
        final String comPath = "http://www.cognitiveweb.com/a";

        final byte[] orgRootKey = fixture.uri2key(orgRoot);
        final byte[] orgPathKey = fixture.uri2key(orgPath);
        final byte[] comPathKey = fixture.uri2key(comPath);

        if (log.isInfoEnabled()) {
            log.info("k1(" + orgRoot + ") = " + BytesUtil.toString(orgRootKey));
            log.info("k2(" + orgPath + ") = " + BytesUtil.toString(orgPathKey));
            log.info("k3(" + comPath + ") = " + BytesUtil.toString(comPathKey));
        }

        // subdirectory sorts after root directory.
        final int rootVsPath = BytesUtil.compareBytes(orgRootKey, orgPathKey);
        assertTrue(rootVsPath < 0);

        // .com extension sorts before .org
        final int orgVsCom = BytesUtil.compareBytes(orgPathKey, comPathKey);
        assertTrue(orgVsCom > 0);

    }
    
    /**
     * Verifies that the keys generated for the plain literals "abc", "abcd"
     * and "abcde" are in strictly ascending order.
     */
    public void test_plainLiteral() {

        final String shortLabel = "abc";
        final String midLabel = "abcd";
        final String longLabel = "abcde";

        final byte[] shortKey = fixture.plainLiteral2key(shortLabel);
        final byte[] midKey = fixture.plainLiteral2key(midLabel);
        final byte[] longKey = fixture.plainLiteral2key(longLabel);

        if (log.isInfoEnabled()) {
            log.info("k1(" + shortLabel + ") = " + BytesUtil.toString(shortKey));
            log.info("k2(" + midLabel + ") = " + BytesUtil.toString(midKey));
            log.info("k3(" + longLabel + ") = " + BytesUtil.toString(longKey));
        }

        // each label is a proper prefix of the next one.
        final int shortVsMid = BytesUtil.compareBytes(shortKey, midKey);
        assertTrue(shortVsMid < 0);

        final int midVsLong = BytesUtil.compareBytes(midKey, longKey);
        assertTrue(midVsLong < 0);

    }
    
    /**
     * Verifies the relative order of the keys generated for language code
     * literals: the same label under "en" and "de", and a longer label under
     * "de".
     */
    public void test_languageCodeLiteral() {

        final String english = "en";
        final String german = "de";

        final String label = "abc";
        final String sameLabel = "abc";
        final String longerLabel = "abce";

        final byte[] enKey = fixture.languageCodeLiteral2key(english, label);
        final byte[] deKey = fixture.languageCodeLiteral2key(german, sameLabel);
        final byte[] deLongerKey = fixture.languageCodeLiteral2key(german,
                longerLabel);

        if (log.isInfoEnabled()) {
            log.info("k1(en:" + label + ") = " + BytesUtil.toString(enKey));
            log.info("k2(de:" + sameLabel + ") = " + BytesUtil.toString(deKey));
            log.info("k3(de:" + longerLabel + ") = "
                    + BytesUtil.toString(deLongerKey));
        }

        final int enVsDe = BytesUtil.compareBytes(enKey, deKey);

        // "en" sorts after "de".
        assertTrue(enVsDe > 0);

        // en:abc != de:abc
        assertTrue(enVsDe != 0);

        // within "de", the shorter label sorts first.
        final int deVsDeLonger = BytesUtil.compareBytes(deKey, deLongerKey);
        assertTrue(deVsDeLonger < 0);

    }
    
    /**
     * Verifies that a plain literal and a language code literal having the
     * same label are assigned different keys, and that the plain literal key
     * is ordered first.
     */
    public void test_plain_vs_languageCode_literal() {

        final String languageCode = "en";

        final String label = "abc";

        final byte[] plainKey = fixture.plainLiteral2key(label);
        final byte[] langKey = fixture.languageCodeLiteral2key(languageCode,
                label);

        // not encoded onto the same key.
        final boolean sameKey = BytesUtil.bytesEqual(plainKey, langKey);
        assertFalse(sameKey);

        // the plain literals are ordered before the language code literals.
        final int plainVsLang = BytesUtil.compareBytes(plainKey, langKey);
        assertTrue(plainVsLang < 0);

    }

    /**
     * Verify an unknown datatype URI is coded.
     * <p>
     * The datatype URI used here is an arbitrary URI. The test passes if the
     * key builder does not throw and returns a key.
     */
    public void test_datatype_unknown() {

        final byte[] key = fixture.datatypeLiteral2key(new URIImpl(
                "http://www.bigdata.com/foo"), "foo");

        // previously the result was discarded and nothing was asserted.
        assertNotNull("No key generated for unknown datatype", key);

    }
    
    /**
     * Verifies the keys generated for <code>xsd:boolean</code> literals using
     * the lexical forms "true", "false", "1" and "0".
     */
    public void test_datatypeLiteral_xsd_boolean() {

        final URI xsdBoolean = XMLSchema.BOOLEAN;

        final String trueWord = "true";
        final String falseWord = "false";
        final String trueDigit = "1";
        final String falseDigit = "0";

        final byte[] trueWordKey = fixture.datatypeLiteral2key(xsdBoolean,
                trueWord);
        final byte[] falseWordKey = fixture.datatypeLiteral2key(xsdBoolean,
                falseWord);
        final byte[] trueDigitKey = fixture.datatypeLiteral2key(xsdBoolean,
                trueDigit);
        final byte[] falseDigitKey = fixture.datatypeLiteral2key(xsdBoolean,
                falseDigit);

        if (log.isInfoEnabled()) {
            log.info("k1(boolean:" + trueWord + ") = "
                    + BytesUtil.toString(trueWordKey));
            log.info("k2(boolean:" + falseWord + ") = "
                    + BytesUtil.toString(falseWordKey));
            log.info("k3(boolean:" + trueDigit + ") = "
                    + BytesUtil.toString(trueDigitKey));
            log.info("k4(boolean:" + falseDigit + ") = "
                    + BytesUtil.toString(falseDigitKey));
        }

        // "true" and "false" have distinct keys, with "true" ordered last.
        final int trueVsFalse = BytesUtil.compareBytes(trueWordKey,
                falseWordKey);
        assertTrue(trueVsFalse != 0);
        assertTrue(trueVsFalse > 0);

        /*
         * Note: if we do not normalize data type values then these are
         * inequalities.
         */
        final int trueWordVsDigit = BytesUtil.compareBytes(trueWordKey,
                trueDigitKey);
        assertTrue(trueWordVsDigit != 0); // true != 1

        final int falseWordVsDigit = BytesUtil.compareBytes(falseWordKey,
                falseDigitKey);
        assertTrue(falseWordVsDigit != 0); // false != 0

    }
    
    /**
     * Verifies the relative order of the keys generated for
     * <code>xsd:int</code> literals, and that two different lexical forms of
     * the same value ("005" and "5") are assigned different keys.
     */
    public void test_datatypeLiteral_xsd_int() {
        
        final URI datatype = XMLSchema.INT;
        
        // Note: leading zeros are ignored in the xsd:int value space.
        final String lit1 = "-4";
        final String lit2 = "005";
        final String lit3 = "5";
        final String lit4 = "6";
        
        final byte[] k1 = fixture.datatypeLiteral2key(datatype,lit1);
        final byte[] k2 = fixture.datatypeLiteral2key(datatype,lit2);
        final byte[] k3 = fixture.datatypeLiteral2key(datatype,lit3);
        final byte[] k4 = fixture.datatypeLiteral2key(datatype,lit4);

        if (log.isInfoEnabled()) {
            log.info("k1(int:" + lit1 + ") = " + BytesUtil.toString(k1));
            log.info("k2(int:" + lit2 + ") = " + BytesUtil.toString(k2));
            // label corrected: this line reports k3 (was mislabelled "k2").
            log.info("k3(int:" + lit3 + ") = " + BytesUtil.toString(k3));
            log.info("k4(int:" + lit4 + ") = " + BytesUtil.toString(k4));
        }
        
        assertTrue(BytesUtil.compareBytes(k1, k2) < 0);
        assertTrue(BytesUtil.compareBytes(k3, k4) < 0);

        /*
         * Note: if we do not normalize data type values then these are
         * inequalities.
         */
        assertTrue(BytesUtil.compareBytes(k2, k3) != 0); // 005 != 5

    }

    /**
     * Verify that the value spaces for long, int, short and byte are disjoint:
     * the literal "-1" must be assigned a different key under each of those
     * datatypes. Every pair of the four datatypes is checked.
     */
    public void test_disjoint_value_space() {

        final String label = "-1";

        final byte[] longKey = fixture.datatypeLiteral2key(XMLSchema.LONG,
                label);
        final byte[] intKey = fixture
                .datatypeLiteral2key(XMLSchema.INT, label);
        final byte[] shortKey = fixture.datatypeLiteral2key(XMLSchema.SHORT,
                label);
        final byte[] byteKey = fixture.datatypeLiteral2key(XMLSchema.BYTE,
                label);

        // long vs {int, short, byte}
        assertFalse(BytesUtil.bytesEqual(longKey, intKey));
        assertFalse(BytesUtil.bytesEqual(longKey, shortKey));
        assertFalse(BytesUtil.bytesEqual(longKey, byteKey));

        // int vs {short, byte}
        assertFalse(BytesUtil.bytesEqual(intKey, shortKey));
        assertFalse(BytesUtil.bytesEqual(intKey, byteKey));

        // short vs byte
        assertFalse(BytesUtil.bytesEqual(shortKey, byteKey));

    }
    
    /**
     * Verifies the relative order of the keys generated for
     * <code>xsd:float</code> literals, and that different lexical forms of
     * the same value ("005", "5." and "5.0") are assigned different keys.
     */
    public void test_datatypeLiteral_xsd_float() {
        
        final URI datatype = XMLSchema.FLOAT;
        
        // Note: leading zeros are ignored in the xsd:float value space.
        final String lit1 = "-4.0";
        final String lit2 = "005";
        final String lit3 = "5.";
        final String lit4 = "5.0";
        final String lit5 = "6";
        
        final byte[] k1 = fixture.datatypeLiteral2key(datatype,lit1);
        final byte[] k2 = fixture.datatypeLiteral2key(datatype,lit2);
        final byte[] k3 = fixture.datatypeLiteral2key(datatype,lit3);
        final byte[] k4 = fixture.datatypeLiteral2key(datatype,lit4);
        final byte[] k5 = fixture.datatypeLiteral2key(datatype,lit5);

        if (log.isInfoEnabled()) {
            log.info("k1(float:" + lit1 + ") = " + BytesUtil.toString(k1));
            log.info("k2(float:" + lit2 + ") = " + BytesUtil.toString(k2));
            log.info("k3(float:" + lit3 + ") = " + BytesUtil.toString(k3));
            // corrected: k4 was generated from lit4 (was logged with lit3).
            log.info("k4(float:" + lit4 + ") = " + BytesUtil.toString(k4));
            log.info("k5(float:" + lit5 + ") = " + BytesUtil.toString(k5));
        }

        assertTrue(BytesUtil.compareBytes(k1, k2) < 0);
        assertTrue(BytesUtil.compareBytes(k4, k5) < 0);

        /*
         * Note: if we do not normalize data type values then these are
         * inequalities.
         */
        assertTrue(BytesUtil.compareBytes(k2, k3) != 0); // 005 != 5.
        assertTrue(BytesUtil.compareBytes(k3, k4) != 0); // 5. != 5.0

    }
    
    /**
     * Verifies the relative order of the keys generated for
     * <code>xsd:double</code> literals, and that different lexical forms of
     * the same value ("005", "5." and "5.0") are assigned different keys.
     */
    public void test_datatypeLiteral_xsd_double() {
        
        final URI datatype = XMLSchema.DOUBLE;
        
        // Note: leading zeros are ignored in the xsd:double value space.
        final String lit1 = "-4.0";
        final String lit2 = "005";
        final String lit3 = "5.";
        final String lit4 = "5.0";
        final String lit5 = "6";
        
        final byte[] k1 = fixture.datatypeLiteral2key(datatype,lit1);
        final byte[] k2 = fixture.datatypeLiteral2key(datatype,lit2);
        final byte[] k3 = fixture.datatypeLiteral2key(datatype,lit3);
        final byte[] k4 = fixture.datatypeLiteral2key(datatype,lit4);
        final byte[] k5 = fixture.datatypeLiteral2key(datatype,lit5);

        if (log.isInfoEnabled()) {
            log.info("k1(double:" + lit1 + ") = " + BytesUtil.toString(k1));
            log.info("k2(double:" + lit2 + ") = " + BytesUtil.toString(k2));
            log.info("k3(double:" + lit3 + ") = " + BytesUtil.toString(k3));
            // corrected: k4 was generated from lit4 (was logged with lit3).
            log.info("k4(double:" + lit4 + ") = " + BytesUtil.toString(k4));
            log.info("k5(double:" + lit5 + ") = " + BytesUtil.toString(k5));
        }

        assertTrue(BytesUtil.compareBytes(k1, k2) < 0);
        assertTrue(BytesUtil.compareBytes(k4, k5) < 0);

        /*
         * Note: if we do not normalize data type values then these are
         * inequalities.
         */
        assertTrue(BytesUtil.compareBytes(k2, k3) != 0); // 005 != 5.
        assertTrue(BytesUtil.compareBytes(k3, k4) != 0); // 5. != 5.0

    }

    /**
     * Verify that some value spaces are disjoint: the literal "4" must be
     * assigned a different key as an <code>xsd:int</code> than as either an
     * <code>xsd:float</code> or an <code>xsd:double</code>.
     */
    public void test_datatypeLiteral_xsd_int_not_double_or_float() {
        
        final String lit1 = "4";
        
        final byte[] k0 = fixture.datatypeLiteral2key(XMLSchema.INT, lit1);
        final byte[] k1 = fixture.datatypeLiteral2key(XMLSchema.FLOAT, lit1);
        final byte[] k2 = fixture.datatypeLiteral2key(XMLSchema.DOUBLE, lit1);

        if (log.isInfoEnabled()) {
            // label corrected: k0 is the xsd:int key (was labelled "float").
            log.info("k0(int:" + lit1 + ") = " + BytesUtil.toString(k0));
            log.info("k1(float:" + lit1 + ") = " + BytesUtil.toString(k1));
            log.info("k2(double:" + lit1 + ") = " + BytesUtil.toString(k2));
        }

        assertTrue(BytesUtil.compareBytes(k0, k1) != 0);
        assertTrue(BytesUtil.compareBytes(k0, k2) != 0);
        
    }

    /**
     * Verify that some value spaces are disjoint: the literal "04.21" must be
     * assigned a different key as an <code>xsd:float</code> than as an
     * <code>xsd:double</code>.
     */
    public void test_datatypeLiteral_xsd_float_not_double() {

        final String label = "04.21";

        final byte[] floatKey = fixture.datatypeLiteral2key(XMLSchema.FLOAT,
                label);
        final byte[] doubleKey = fixture.datatypeLiteral2key(XMLSchema.DOUBLE,
                label);

        if (log.isInfoEnabled()) {
            log.info("k1(float:" + label + ") = "
                    + BytesUtil.toString(floatKey));
            log.info("k2(double:" + label + ") = "
                    + BytesUtil.toString(doubleKey));
        }

        final int floatVsDouble = BytesUtil.compareBytes(floatKey, doubleKey);
        assertTrue(floatVsDouble != 0);

    }
    
    /**
     * Verifies that the keys generated for the blank node identifiers "_12",
     * "_abc" and "abc" are in strictly ascending order.
     */
    public void test_blankNode() {

        final String firstId = "_12";
        final String secondId = "_abc";
        final String thirdId = "abc";

        final byte[] firstKey = fixture.blankNode2Key(firstId);
        final byte[] secondKey = fixture.blankNode2Key(secondId);
        final byte[] thirdKey = fixture.blankNode2Key(thirdId);

        if (log.isInfoEnabled()) {
            log.info("k1(bnodeId:" + firstId + ") = "
                    + BytesUtil.toString(firstKey));
            log.info("k2(bnodeId:" + secondId + ") = "
                    + BytesUtil.toString(secondKey));
            log.info("k3(bnodeId:" + thirdId + ") = "
                    + BytesUtil.toString(thirdKey));
        }

        final int firstVsSecond = BytesUtil.compareBytes(firstKey, secondKey);
        assertTrue(firstVsSecond < 0);

        final int secondVsThird = BytesUtil.compareBytes(secondKey, thirdKey);
        assertTrue(secondVsThird < 0);

    }

    /**
     * Test verifies the ordering among URIs, Literals, and BNodes. This
     * ordering is important when batching terms of these different types into
     * the term index since you want to insert the term types according to this
     * order for the best performance.
     */
    public void test_termTypeOrder() {

        /*
         * One key per term type. The particular values are irrelevant: only
         * the relative order between the types is of interest in this test.
         */
        final byte[] uriKey = fixture.uri2key("http://www.cognitiveweb.org");
        final byte[] literalKey = fixture.plainLiteral2key("hello world!");
        final byte[] bnodeKey = fixture.blankNode2Key("a12");

        // URIs before literals.
        final int uriVsLiteral = BytesUtil.compareBytes(uriKey, literalKey);
        assertTrue(uriVsLiteral < 0);

        // literals before blank nodes.
        final int literalVsBnode = BytesUtil.compareBytes(literalKey, bnodeKey);
        assertTrue(literalVsBnode < 0);

    }
    
    /**
     * Regression test for an odd issue which someone reported against the
     * trunk. Two serialized versions of the plain Literal "Brian McCarthy"
     * were observed, one of which carries an additional leading character
     * (byte value 127) ahead of the label. The suspicion is that this is
     * really an issue with the {@link Locale} and the Unicode sort key
     * generation: if the {@link KeyBuilder}, as configured on the system,
     * generates Unicode sort keys which compare as EQUAL for these two inputs
     * then the lexicon will report an "apparent" inconsistency. It may be that
     * the right thing to do is simply to disable the inconsistency check in
     * the lexicon.
     * <p>
     * The originally reported errors were:
     * 
     * <pre>
     * ERROR: com.bigdata.rdf.lexicon.Id2TermWriteProc.apply(Id2TermWriteProc.java:205): val=[0, 2, 0, 14, 66, 114, 105, 97, 110, 32, 77, 99, 67, 97, 114, 116, 104, 121]
     * ERROR: com.bigdata.rdf.lexicon.Id2TermWriteProc.apply(Id2TermWriteProc.java:206): oldval=[0, 2, 0, 15, 127, 66, 114, 105, 97, 110, 32, 77, 99, 67, 97, 114, 116, 104, 121]
     * </pre>
     * 
     * This test deserializes both values, generates their keys using a
     * {@link KeyBuilder} configured with {@link StrengthEnum#Identical}, and
     * verifies that the two keys are distinct.
     */
    public void test_consistencyIssue() {

        final BigdataValueSerializer valueSerializer = new BigdataValueSerializer(
                ValueFactoryImpl.getInstance());

        // the serialized values taken from the reported error messages.
        final byte[] newValBytes = new byte[] { 0, 2, 0, 14, 66, 114, 105, 97,
                110, 32, 77, 99, 67, 97, 114, 116, 104, 121 };

        final byte[] oldValBytes = new byte[] { 0, 2, 0, 15, 127, 66, 114, 105,
                97, 110, 32, 77, 99, 67, 97, 114, 116, 104, 121 };

        final Value newValue = valueSerializer.deserialize(newValBytes);

        final Value oldValue = valueSerializer.deserialize(oldValBytes);

        if (log.isInfoEnabled()) {
            log.info("new=" + newValue);
            log.info("old=" + oldValue);
        }

        /*
         * Note: A KeyBuilder created with its no-argument constructor would
         * use the default Locale and the implied Unicode collation order to
         * generate the sort keys. Instead, the KeyBuilder is explicitly
         * configured here from a set of properties. If you want your KB to run
         * with these properties, then you need to specify them either in your
         * environment or using -D to java.
         */
        final Properties keyBuilderProperties = new Properties();

        /*
         * Specify that all aspects of the Unicode sequence are significant.
         * (The alternative, StrengthEnum.Primary, would specify that only
         * primary character differences are significant.)
         */
        keyBuilderProperties.setProperty(KeyBuilder.Options.STRENGTH,
                StrengthEnum.Identical.toString());

        final IKeyBuilder unicodeKeyBuilder = KeyBuilder
                .newUnicodeInstance(keyBuilderProperties);

        final LexiconKeyBuilder lexiconKeyBuilder = new LexiconKeyBuilder(
                unicodeKeyBuilder);

        // encode as unsigned byte[] key.
        final byte[] newValKey = lexiconKeyBuilder.value2Key(newValue);

        final byte[] oldValKey = lexiconKeyBuilder.value2Key(oldValue);

        if (log.isInfoEnabled()) {
            log.info("newValKey=" + BytesUtil.toString(newValKey));
            log.info("oldValKey=" + BytesUtil.toString(oldValKey));
        }

        /*
         * Note: if this assert fails then the two distinct Literals were mapped
         * onto the same unsigned byte[] key.
         */
        final boolean sameKey = BytesUtil.bytesEqual(newValKey, oldValKey);
        assertFalse(sameKey);

    }

}