// com.bigdata.rdf.lexicon.RDFFullTextIndexTupleSerializer Maven / Gradle / Ivy
//
// Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Jun 13, 2011
 */

package com.bigdata.rdf.lexicon;

import java.io.DataInput;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

import org.apache.log4j.Logger;
import org.apache.lucene.search.similarities.DefaultSimilarity;

import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.ShortPacker;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.search.FullTextIndexTupleSerializer;
import com.bigdata.search.ITermDocKey;
import com.bigdata.search.ITermDocRecord;
import com.bigdata.search.ITermDocVal;
import com.bigdata.search.ReadOnlyTermDocKey;
import com.bigdata.search.ReadOnlyTermDocRecord;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;

/**
 * Replaces the {@link FullTextIndexTupleSerializer} to support {@link IV}s as
 * document identifiers.
 * 
 * <p>
 * Since {@link IV}s have a variable length encoding we have to indicate the
 * length of the {@link IV} either in the key or the value of the
 * {@link ITuple}. This information is stored on the value side of the tuple in
 * order to keep the key format simpler.
 * <p>
 * Note: The RDF database does not make use of the "field" concept in the keys
 * of the full text index. The fieldId will always be reported as -1.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class RDFFullTextIndexTupleSerializer extends
        DefaultTupleSerializer<ITermDocKey, ITermDocVal> {

    private static final Logger log = Logger
            .getLogger(RDFFullTextIndexTupleSerializer.class);

    /**
     * The field identifier reported for every de-serialized tuple (this
     * serializer never writes a field identifier into the key).
     */
    private static final int NO_FIELD = -1;

    /**
     * Shared codec for the compact (normalized) term weight. Only
     * {@link DefaultSimilarity#encodeNormValue(float)} and
     * {@link DefaultSimilarity#decodeNormValue(long)} are invoked on it.
     * <p>
     * See: http://lucene.apache.org/core/old_versioned_docs/versions/3_0_2/api/all/org/apache/lucene/search/Similarity.html
     * for more information on the round-trip of the normalized term weight.
     * <p>
     * NOTE(review): sharing a single instance assumes that those two methods
     * do not depend on mutable instance state -- confirm against the Lucene
     * version on the classpath.
     */
    private static final DefaultSimilarity SIMILARITY = new DefaultSimilarity();

    /**
     * Used to serialize the values for the tuples in the index.
     * 
     * Note: While this object is not thread-safe, the mutable B+Tree is
     * restricted to a single writer so it does not have to be thread-safe.
     */
    private final transient DataOutputBuffer buf = new DataOutputBuffer(24);

    /**
     * De-serialization constructor.
     */
    public RDFFullTextIndexTupleSerializer() {
    }

    /**
     * @param keyBuilderFactory
     *            This factory governs the Unicode collation order that will be
     *            imposed on the indexed tokens.
     * @param leafKeysCoder
     *            The coder used for the leaf keys (prefix coding is fine).
     * @param leafValsCoder
     *            The coder used for the leaf values (custom coding may provide
     *            tighter representations of the {@link ITermDocVal}s in the
     *            index entries).
     * @param fieldsEnabled
     *            Accepted for signature compatibility only. This
     *            implementation ignores the argument: no fieldId component is
     *            ever written into the generated key.
     */
    public RDFFullTextIndexTupleSerializer(//
            final IKeyBuilderFactory keyBuilderFactory,//
            final IRabaCoder leafKeysCoder, //
            final IRabaCoder leafValsCoder,//
            final boolean fieldsEnabled//
    ) {

        super(keyBuilderFactory, leafKeysCoder, leafValsCoder);

    }

    /**
     * Builds the key for an index entry from, in order: the token text, the
     * compact encoding of the local term weight, and the encoded document
     * identifier {@link IV}.
     * 
     * @param obj
     *            The entry; must be an {@link ITermDocKey} whose document
     *            identifier is an {@link IV}.
     * 
     * @return The key.
     * 
     * @throws ClassCastException
     *             if <i>obj</i> is not an {@link ITermDocKey} or its document
     *             identifier is not an {@link IV}.
     */
    @Override
    public byte[] serializeKey(final Object obj) {

        final ITermDocKey entry = (ITermDocKey) obj;

        final String termText = entry.getToken();

        final double termWeight = entry.getLocalTermWeight();

        /*
         * The weight is narrowed to a float and reduced to its compact norm
         * encoding.
         * 
         * Note: The value is deliberately appended with the same (long) type
         * as before. The decoder only reads the single byte immediately
         * preceding the docId, so it depends on this exact layout.
         * NOTE(review): changing the appended type would change the key
         * format of existing indices.
         */
        final long termWeightCompact = SIMILARITY
                .encodeNormValue((float) termWeight);

        final IV docId = (IV) entry.getDocId();

        final IKeyBuilder keyBuilder = getKeyBuilder();

        keyBuilder.reset();

        // the token text (or its successor as desired).
        keyBuilder
                .appendText(termText, true/* unicode */, false/* successor */);

        keyBuilder.append(termWeightCompact);

        IVUtility.encode(keyBuilder, docId);

        final byte[] key = keyBuilder.getKey();

        if (log.isDebugEnabled()) {

            log.debug("{" + termText + "," + docId + "}, key="
                    + BytesUtil.toString(key));

        }

        return key;

    }

    /**
     * Builds the value for an index entry. The value carries only the packed
     * byte length of the document identifier {@link IV}, which is what allows
     * the variable length {@link IV} to be located at the end of the key when
     * the tuple is de-serialized.
     * 
     * @param obj
     *            The entry; must also implement {@link ITermDocRecord} since
     *            the document identifier is read from it.
     * 
     * @return The value.
     * 
     * @throws IllegalArgumentException
     *             if the byte length of the {@link IV} exceeds
     *             {@link Short#MAX_VALUE}.
     * @throws ClassCastException
     *             if <i>obj</i> is not an {@link ITermDocRecord} or its
     *             document identifier is not an {@link IV}.
     */
    @Override
    public byte[] serializeVal(final ITermDocVal obj) {

        if (log.isDebugEnabled()) {
            log.debug(obj);
        }

        buf.reset();

        final int byteLen = 
            ((IV) ((ITermDocRecord) obj).getDocId()).byteLength();

        if (byteLen > Short.MAX_VALUE) {

            throw new IllegalArgumentException("cannot serialize IVs longer than Short.MAX_VALUE");

        }

        // The byte length of the document identifier IV.
        buf.packShort((short) byteLen);

        return buf.toByteArray();

    }

    /**
     * De-serializes the key side of the tuple (document identifier and term
     * weight). Delegates to {@link #deserialize(ITuple, boolean)}, which also
     * reads the tuple's value stream in order to locate the document
     * identifier within the key.
     */
    @Override
    public ITermDocKey deserializeKey(final ITuple tuple) {

        return deserialize(tuple, true/* keyOnly */);

    }

    /**
     * De-serializes the tuple as an {@link ITermDocRecord}. The token is not
     * recovered from the key and is reported as <code>null</code>.
     */
    @Override
    public ITermDocRecord deserialize(final ITuple tuple) {

        return (ITermDocRecord) deserialize(tuple, false/* keyOnly */);

    }

    /**
     * Decodes the document identifier and the term weight from a tuple.
     * <p>
     * The byte length of the {@link IV} is unpacked from the value of the
     * tuple. The {@link IV} is then decoded from the last <i>byteLength</i>
     * bytes of the key and the compact term weight is taken from the single
     * byte which immediately precedes it.
     * 
     * @param tuple
     *            The tuple. Its value stream is read even when <i>keyOnly</i>
     *            is <code>true</code>.
     * @param keyOnly
     *            When <code>true</code> a {@link ReadOnlyTermDocKey} is
     *            returned, otherwise a {@link ReadOnlyTermDocRecord}.
     * 
     * @return The decoded object. The fieldId is always {@link #NO_FIELD}.
     * 
     * @throws RuntimeException
     *             wrapping any {@link IOException} raised while unpacking the
     *             byte length from the value.
     */
    protected ITermDocKey deserialize(final ITuple tuple, final boolean keyOnly) {

        final ByteArrayBuffer kbuf = tuple.getKeyBuffer();

        // The byte length of the docId IV.
        final int byteLength;
        try {
            byteLength = ShortPacker.unpackShort((DataInput) tuple
                    .getValueStream());
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }

        // The IV occupies the tail of the key.
        final int docIdOffset = kbuf.limit() - byteLength;

        // Decode the IV.
        final IV docId = (IV) IVUtility.decodeFromOffset(kbuf.array(),
                docIdOffset);

        // The compact term weight is the byte just before the IV.
        final int termWeightOffset = docIdOffset - Bytes.SIZEOF_BYTE;

        final byte termWeightCompact = kbuf.getByte(termWeightOffset);

        final double termWeight = SIMILARITY
                .decodeNormValue(termWeightCompact);

        if (keyOnly) {

            return new ReadOnlyTermDocKey(docId, NO_FIELD, termWeight);

        }

        return new ReadOnlyTermDocRecord(null/* token */, docId, NO_FIELD,
                termWeight);

    }

    /**
     * The initial version.
     */
    private static final byte VERSION0 = 0;

    /**
     * The version written by {@link #writeExternal(ObjectOutput)}.
     */
    private static final byte VERSION = VERSION0;

    /**
     * Restores the state written by {@link #writeExternal(ObjectOutput)}: the
     * superclass state followed by a version byte. No additional state is
     * stored for {@link #VERSION0}.
     * 
     * @throws IOException
     *             if the version byte is not recognized.
     */
    @Override
    public void readExternal(final ObjectInput in) throws IOException,
            ClassNotFoundException {
        super.readExternal(in);
        final byte version = in.readByte();
        switch (version) {
        case VERSION0:
            break;
        default:
            throw new IOException("unknown version=" + version);
        }

    }

    /**
     * Writes the superclass state followed by the {@link #VERSION} byte.
     */
    @Override
    public void writeExternal(final ObjectOutput out) throws IOException {
        super.writeExternal(out);
        out.writeByte(VERSION);
    }

}