All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.search.FullTextIndexTupleSerializer Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Jun 13, 2011
 */

package com.bigdata.search;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

import org.apache.log4j.Logger;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;

import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;

/**
 * Class manages the encoding and decoding of keys for the full text index. You
 * can override this class to change the way in which the keys and/or values of
 * the index are stored. For example, the RDF database does this to use variable
 * length document identifiers.
 * 
 * @author Bryan Thompson
 * @version $Id: FullTextIndexTupleSerializer.java 4702 2011-06-13 16:25:38Z
 *          thompsonbry $
 */
public class FullTextIndexTupleSerializer> extends
        DefaultTupleSerializer, ITermDocVal> {

    final private static transient Logger log = Logger
            .getLogger(FullTextIndexTupleSerializer.class);

    private boolean fieldsEnabled;
//    private boolean doublePrecision;
    
    public boolean isFieldsEnabled() {
        return fieldsEnabled;
    }

//    public boolean isDoublePrecision() {
//        return doublePrecision;
//    }
    
//    /**
//     * Used to serialize the values for the tuples in the index.
//     * 

// * Note: While this object is not thread-safe, the mutable B+Tree is // * restricted to a single writer so it does not have to be thread-safe. // */ // final transient private DataOutputBuffer buf = new DataOutputBuffer(24); /** * De-serialization constructor. */ public FullTextIndexTupleSerializer() { } /** * @param keyBuilderFactory * This factory governs the Unicode collation order that will be * imposed on the indexed tokens. * @param leafKeysCoder * The coder used for the leaf keys (prefix coding is fine). * @param leafValsCoder * The coder used for the leaf values (custom coding may provide * tighter representations of the {@link ITermDocVal}s in the * index entries). * @param fieldsEnabled * When true the fieldId will be * included as a component in the generated key. When * false it will not be present in the generated * key. */ public FullTextIndexTupleSerializer(// final IKeyBuilderFactory keyBuilderFactory,// final IRabaCoder leafKeysCoder, // final IRabaCoder leafValsCoder,// final boolean fieldsEnabled// // final boolean doublePrecision// ) { super(keyBuilderFactory, leafKeysCoder, leafValsCoder); this.fieldsEnabled = fieldsEnabled; // this.doublePrecision = doublePrecision; } @Override public byte[] serializeKey(final Object obj) { @SuppressWarnings("unchecked") final ITermDocKey entry = (ITermDocKey) obj; final String termText = entry.getToken(); final double termWeight = entry.getLocalTermWeight(); /* * See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html * * For more information on the round-trip of normalized term weight. */ final DefaultSimilarity similarity = new DefaultSimilarity(); final long termWeightCompact = similarity.encodeNormValue((float) termWeight); final V docId = entry.getDocId(); final IKeyBuilder keyBuilder = getKeyBuilder(); keyBuilder.reset(); // the token text (or its successor as desired). 
keyBuilder .appendText(termText, true/* unicode */, false/* successor */); keyBuilder.append(termWeightCompact); keyBuilder.append((V) docId); if (fieldsEnabled) keyBuilder.append(entry.getFieldId()); final byte[] key = keyBuilder.getKey(); if (log.isDebugEnabled()) { log.debug("{" + termText + "," + docId + (fieldsEnabled ? "," + entry.getFieldId() : "") + "}, key=" + BytesUtil.toString(key)); } return key; } @Override public byte[] serializeVal(final ITermDocVal obj) { return null; // final ITermDocVal val = (ITermDocVal) obj; // // if (log.isDebugEnabled()) { // log.debug(val); // } // // buf.reset(); // // final int termFreq = val.termFreq(); // // final double localTermWeight = val.getLocalTermWeight(); // // // The term frequency // buf.putShort(termFreq > Short.MAX_VALUE ? Short.MAX_VALUE // : (short) termFreq); // // // The term weight // if (doublePrecision) // buf.putDouble(localTermWeight); // else // buf.putFloat((float) localTermWeight); // // return buf.toByteArray(); } @Override public ITermDocKey deserializeKey(final ITuple tuple) { return deserialize(tuple, true/* keyOnly */); } @Override public ITermDocRecord deserialize(final ITuple tuple) { return (ITermDocRecord) deserialize(tuple, false/* keyOnly */); } protected ITermDocKey deserialize(final ITuple tuple, final boolean keyOnly) { // key is {term,docId,fieldId} // final byte[] key = tuple.getKey(); // // // decode the document identifier. // final long docId = KeyBuilder.decodeLong(key, key.length // - Bytes.SIZEOF_LONG /*docId*/ - Bytes.SIZEOF_INT/*fieldId*/); final ByteArrayBuffer kbuf = tuple.getKeyBuffer(); /* * The byte offset of the docId in the key. * * Note: This is also the byte length of the match on the unicode sort * key, which appears at the head of the key. */ final int docIdOffset = kbuf.limit() - Bytes.SIZEOF_LONG /* docId */ - (fieldsEnabled ? 
Bytes.SIZEOF_INT/* fieldId */: 0); final V docId = (V) (Object)Long.valueOf(KeyBuilder.decodeLong(kbuf.array(), docIdOffset)); // Decode field when present final int fieldId; if (fieldsEnabled) { fieldId = KeyBuilder.decodeShort(kbuf.array(), kbuf.limit() - Bytes.SIZEOF_INT); } else { fieldId = -1; } final int termWeightOffset = docIdOffset - Bytes.SIZEOF_BYTE; final byte termWeightCompact = kbuf.getByte(termWeightOffset); /* * See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html * * For more information on the round-trip of normalized term weight. */ final DefaultSimilarity similarity = new DefaultSimilarity(); final double termWeight = similarity.decodeNormValue(termWeightCompact); if (keyOnly) { return new ReadOnlyTermDocKey(docId, fieldId, termWeight); } // final int termFreq; // final double termWeight; // try { // // final DataInputBuffer dis = tuple.getValueStream(); // // termFreq = dis.readShort(); // // if(doublePrecision) // termWeight = dis.readDouble(); // else // termWeight = dis.readFloat(); // // } catch (IOException ex) { // // throw new RuntimeException(ex); // // } // return new ReadOnlyTermDocRecord(null/* token */, docId, fieldId, /* termFreq, */ termWeight); } /** * The initial version. */ private static final transient byte VERSION0 = 0; private static final transient byte VERSION = VERSION0; public void readExternal(final ObjectInput in) throws IOException, ClassNotFoundException { super.readExternal(in); final byte version = in.readByte(); switch (version) { case VERSION0: break; default: throw new IOException("unknown version=" + version); } this.fieldsEnabled = in.readBoolean(); // this.doublePrecision = in.readBoolean(); } public void writeExternal(final ObjectOutput out) throws IOException { super.writeExternal(out); out.writeByte(VERSION); out.writeBoolean(fieldsEnabled); // out.writeBoolean(doublePrecision); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy