com.bigdata.search.FullTextIndexTupleSerializer Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jun 13, 2011
*/
package com.bigdata.search;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import org.apache.log4j.Logger;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
/**
* Class manages the encoding and decoding of keys for the full text index. You
* can override this class to change the way in which the keys and/or values of
* the index are stored. For example, the RDF database does this to use variable
* length document identifiers.
*
* @author Bryan Thompson
* @version $Id: FullTextIndexTupleSerializer.java 4702 2011-06-13 16:25:38Z
* thompsonbry $
*/
public class FullTextIndexTupleSerializer> extends
DefaultTupleSerializer, ITermDocVal> {
final private static transient Logger log = Logger
.getLogger(FullTextIndexTupleSerializer.class);
private boolean fieldsEnabled;
// private boolean doublePrecision;
public boolean isFieldsEnabled() {
return fieldsEnabled;
}
// public boolean isDoublePrecision() {
// return doublePrecision;
// }
// /**
// * Used to serialize the values for the tuples in the index.
// *
// * Note: While this object is not thread-safe, the mutable B+Tree is
// * restricted to a single writer so it does not have to be thread-safe.
// */
// final transient private DataOutputBuffer buf = new DataOutputBuffer(24);
/**
* De-serialization constructor.
*/
public FullTextIndexTupleSerializer() {
}
/**
* @param keyBuilderFactory
* This factory governs the Unicode collation order that will be
* imposed on the indexed tokens.
* @param leafKeysCoder
* The coder used for the leaf keys (prefix coding is fine).
* @param leafValsCoder
* The coder used for the leaf values (custom coding may provide
* tighter representations of the {@link ITermDocVal}s in the
* index entries).
* @param fieldsEnabled
* When true
the fieldId
will be
* included as a component in the generated key. When
* false
it will not be present in the generated
* key.
*/
public FullTextIndexTupleSerializer(//
final IKeyBuilderFactory keyBuilderFactory,//
final IRabaCoder leafKeysCoder, //
final IRabaCoder leafValsCoder,//
final boolean fieldsEnabled//
// final boolean doublePrecision//
) {
super(keyBuilderFactory, leafKeysCoder, leafValsCoder);
this.fieldsEnabled = fieldsEnabled;
// this.doublePrecision = doublePrecision;
}
@Override
public byte[] serializeKey(final Object obj) {
@SuppressWarnings("unchecked")
final ITermDocKey entry = (ITermDocKey) obj;
final String termText = entry.getToken();
final double termWeight = entry.getLocalTermWeight();
/*
* See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html
*
* For more information on the round-trip of normalized term weight.
*/
final DefaultSimilarity similarity = new DefaultSimilarity();
final long termWeightCompact = similarity.encodeNormValue((float) termWeight);
final V docId = entry.getDocId();
final IKeyBuilder keyBuilder = getKeyBuilder();
keyBuilder.reset();
// the token text (or its successor as desired).
keyBuilder
.appendText(termText, true/* unicode */, false/* successor */);
keyBuilder.append(termWeightCompact);
keyBuilder.append((V) docId);
if (fieldsEnabled)
keyBuilder.append(entry.getFieldId());
final byte[] key = keyBuilder.getKey();
if (log.isDebugEnabled()) {
log.debug("{" + termText + "," + docId
+ (fieldsEnabled ? "," + entry.getFieldId() : "")
+ "}, key=" + BytesUtil.toString(key));
}
return key;
}
@Override
public byte[] serializeVal(final ITermDocVal obj) {
return null;
// final ITermDocVal val = (ITermDocVal) obj;
//
// if (log.isDebugEnabled()) {
// log.debug(val);
// }
//
// buf.reset();
//
// final int termFreq = val.termFreq();
//
// final double localTermWeight = val.getLocalTermWeight();
//
// // The term frequency
// buf.putShort(termFreq > Short.MAX_VALUE ? Short.MAX_VALUE
// : (short) termFreq);
//
// // The term weight
// if (doublePrecision)
// buf.putDouble(localTermWeight);
// else
// buf.putFloat((float) localTermWeight);
//
// return buf.toByteArray();
}
@Override
public ITermDocKey deserializeKey(final ITuple tuple) {
return deserialize(tuple, true/* keyOnly */);
}
@Override
public ITermDocRecord deserialize(final ITuple tuple) {
return (ITermDocRecord) deserialize(tuple, false/* keyOnly */);
}
protected ITermDocKey deserialize(final ITuple tuple,
final boolean keyOnly) {
// key is {term,docId,fieldId}
// final byte[] key = tuple.getKey();
//
// // decode the document identifier.
// final long docId = KeyBuilder.decodeLong(key, key.length
// - Bytes.SIZEOF_LONG /*docId*/ - Bytes.SIZEOF_INT/*fieldId*/);
final ByteArrayBuffer kbuf = tuple.getKeyBuffer();
/*
* The byte offset of the docId in the key.
*
* Note: This is also the byte length of the match on the unicode sort
* key, which appears at the head of the key.
*/
final int docIdOffset = kbuf.limit() - Bytes.SIZEOF_LONG /* docId */
- (fieldsEnabled ? Bytes.SIZEOF_INT/* fieldId */: 0);
final V docId = (V) (Object)Long.valueOf(KeyBuilder.decodeLong(kbuf.array(),
docIdOffset));
// Decode field when present
final int fieldId;
if (fieldsEnabled) {
fieldId = KeyBuilder.decodeShort(kbuf.array(), kbuf.limit()
- Bytes.SIZEOF_INT);
} else {
fieldId = -1;
}
final int termWeightOffset = docIdOffset - Bytes.SIZEOF_BYTE;
final byte termWeightCompact = kbuf.getByte(termWeightOffset);
/*
* See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html
*
* For more information on the round-trip of normalized term weight.
*/
final DefaultSimilarity similarity = new DefaultSimilarity();
final double termWeight = similarity.decodeNormValue(termWeightCompact);
if (keyOnly) {
return new ReadOnlyTermDocKey(docId, fieldId, termWeight);
}
// final int termFreq;
// final double termWeight;
// try {
//
// final DataInputBuffer dis = tuple.getValueStream();
//
// termFreq = dis.readShort();
//
// if(doublePrecision)
// termWeight = dis.readDouble();
// else
// termWeight = dis.readFloat();
//
// } catch (IOException ex) {
//
// throw new RuntimeException(ex);
//
// }
//
return new ReadOnlyTermDocRecord(null/* token */, docId, fieldId,
/* termFreq, */ termWeight);
}
/**
* The initial version.
*/
private static final transient byte VERSION0 = 0;
private static final transient byte VERSION = VERSION0;
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
super.readExternal(in);
final byte version = in.readByte();
switch (version) {
case VERSION0:
break;
default:
throw new IOException("unknown version=" + version);
}
this.fieldsEnabled = in.readBoolean();
// this.doublePrecision = in.readBoolean();
}
public void writeExternal(final ObjectOutput out) throws IOException {
super.writeExternal(out);
out.writeByte(VERSION);
out.writeBoolean(fieldsEnabled);
// out.writeBoolean(doublePrecision);
}
}