/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 21, 2007
*/
package com.bigdata.rdf.lexicon;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.log4j.Logger;
import com.bigdata.btree.ICounter;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.AbstractLocalSplitResultAggregator;
import com.bigdata.btree.proc.IParallelizableIndexProcedure;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.btree.proc.SplitValuePair;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.LongPacker;
import com.bigdata.io.ShortPacker;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.lexicon.Term2IdWriteProc.Result;
import com.bigdata.relation.IMutableRelationIndexWriteProcedure;
import com.bigdata.service.Split;
import com.bigdata.util.BytesUtil;
/**
* This unisolated operation inserts terms into the term:id index,
* assigning identifiers to terms as a side-effect. The use of this operation
* MUST be followed by the use of {@link Id2TermWriteProc} to ensure that
* the reverse mapping from id to term is defined before any statements are
* inserted using the assigned term identifiers. The client MUST NOT make
* assertions using the assigned term identifiers until the corresponding
* {@link Id2TermWriteProc} operation has succeeded.
*
* In order for the lexicon to remain consistent if the client fails for any
* reason after the forward mapping has been made restart-safe and before the
* reverse mapping has been made restart-safe, clients MUST always use a
* successful {@link Term2IdWriteProc} followed by a successful
* {@link Id2TermWriteProc} before inserting statements using term identifiers
* into the statement indices. In particular, a client MUST NOT treat lookup
* against the terms index as satisfactory evidence that the term also exists
* in the reverse mapping.
*
* Note that it is perfectly possible that a concurrent client will overlap in
* the terms being inserted. The results will always be fully consistent if the
* rules of the road are observed since (a) unisolated operations are
* single-threaded; (b) term identifiers are assigned in an unisolated atomic
* operation by {@link Term2IdWriteProc}; and (c) the reverse mapping is made
* consistent with the assignments made/discovered by the forward mapping.
*
* Note: The {@link Term2IdWriteProc} and {@link Id2TermWriteProc} operations
* may be analyzed as a batch variant of the following pseudo code.
 *
 * <pre>
 * for each term:
 *
 *     termId = null;
 *
 *     synchronized (ndx) {
 *
 *         counter = ndx.getCounter();
 *
 *         termId = ndx.lookup(term.key);
 *
 *         if (termId == null) {
 *
 *             termId = counter.inc();
 *
 *             ndx.insert(term.key, termId);
 *
 *         }
 *
 *     }
 * </pre>
 *
* In addition, the actual operations against scale-out indices are performed on
* index partitions rather than on the whole index.
*
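 * Note: The following is a minimal client-side sketch of the required
 * ordering. The <code>lexicon</code> object and its methods are
 * illustrative placeholders, not part of this class:
 *
 * <pre>
 * // 1. Make the forward (term:id) mapping restart-safe, collecting the
 * // discovered / assigned term identifiers.
 * final Term2IdWriteProc.Result result = lexicon.writeTerm2Id(terms);
 *
 * // 2. Make the reverse (id:term) mapping restart-safe BEFORE using the ids.
 * lexicon.writeId2Term(result.ivs, terms);
 *
 * // 3. Only now may statements be asserted using those term identifiers.
 * lexicon.writeStatements(statements);
 * </pre>
 *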
* @author Bryan Thompson
*/
public class Term2IdWriteProc extends AbstractKeyArrayIndexProcedure<Result>
        implements IParallelizableIndexProcedure<Result>,
        IMutableRelationIndexWriteProcedure {
private static final Logger log = Logger.getLogger(Term2IdWriteProc.class);
// static {
// if(DEBUG) {
//
// log.removeAllAppenders();
//
// try {
// log.addAppender(new FileAppender(new SimpleLayout(),"Term2IdWriteProc.log"));
// } catch (IOException e) {
// e.printStackTrace();
// }
//
// }
// }
/**
 * Flag enables optional ground truth verification. Verification runs only
 * when DEBUG logging is enabled AND this flag is ALSO set.
 *
 * WARNING: This IS NOT scalable!
 *
 * WARNING: This option IS NOT safe when using more than one triple
 * store either concurrently or in sequence! For example, you can use it to
 * examine a single unit test for inconsistencies, but not a sequence of unit
 * tests, since the data are kept in the same global maps and would hence
 * confound the tests!
*/
private static boolean enableGroundTruth = false;
private static ConcurrentHashMap<Long, byte[]> groundTruthId2Term;
private static ConcurrentHashMap<byte[], Long> groundTruthTerm2Id;
static {
if (log.isDebugEnabled() && enableGroundTruth) {
log.warn("Will track ground truth assignments");
// note: use a large initial capacity. default concurrency level is 16.
groundTruthId2Term = new ConcurrentHashMap<Long, byte[]>(500000);
groundTruthTerm2Id = new ConcurrentHashMap<byte[], Long>(500000);
}
}
/**
*
*/
private static final long serialVersionUID = -4736465754523655679L;
/**
 * Serialized as extended metadata. When <code>true</code>, unknown terms
 * are NOT added to the database.
*/
private boolean readOnly;
@Override
public final boolean isReadOnly() {
return readOnly;
}
/**
 * Serialized as extended metadata. When <code>true</code>, blank nodes
 * are stored in the lexicon's forward index.
*/
private boolean storeBlankNodes;
public final boolean isStoreBlankNodes() {
return storeBlankNodes;
}
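/**
 * The #of bits of the assigned term identifier which the
 * {@link TermIdEncoder} will reverse in scale-out in order to spread the
 * assigned identifiers across the key range (ZERO disables the encoder).
 */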
private int scaleOutTermIdBitsToReverse;
/**
* De-serialization constructor.
*/
public Term2IdWriteProc() {
}
protected Term2IdWriteProc(IRabaCoder keySer, int fromIndex,
int toIndex, byte[][] keys, boolean readOnly,
boolean storeBlankNodes, int scaleOutTermIdBitsToReverse) {
super(keySer, null, fromIndex, toIndex, keys, null /* vals */);
this.readOnly = readOnly;
this.storeBlankNodes = storeBlankNodes;
this.scaleOutTermIdBitsToReverse = scaleOutTermIdBitsToReverse;
}
public static class Term2IdWriteProcConstructor extends
        AbstractKeyArrayIndexProcedureConstructor<Term2IdWriteProc> {
private final boolean readOnly;
private final boolean storeBlankNodes;
private final int scaleOutTermIdBitsToReverse;
/**
* Values ARE NOT sent.
*/
@Override
public final boolean sendValues() {
return false;
}
public Term2IdWriteProcConstructor(final boolean readOnly,
final boolean storeBlankNodes,
final int scaleOutTermIdBitsToReverse) {
this.readOnly = readOnly;
this.storeBlankNodes = storeBlankNodes;
this.scaleOutTermIdBitsToReverse = scaleOutTermIdBitsToReverse;
}
@Override
public Term2IdWriteProc newInstance(final IRabaCoder keySer,
final IRabaCoder valSer, final int fromIndex,
final int toIndex, final byte[][] keys, final byte[][] vals) {
assert vals == null;
if(log.isInfoEnabled())
log.info("TERM2ID Proc Ctor: ntuples=" + (toIndex-fromIndex));
return new Term2IdWriteProc(keySer, fromIndex, toIndex, keys,
readOnly, storeBlankNodes, scaleOutTermIdBitsToReverse);
}
}
/**
 * For each term whose serialized key is mapped to the current index
 * partition, look up the term in the terms index. If it is there,
 * then note its assigned termId. Otherwise, use the partition-local counter
 * to assign the term identifier, note the assignment so that it can be
 * communicated back to the client, and insert the {term, termId} entry into
 * the terms index.
*
* @param ndx
* The terms index.
*
* @return The {@link Result}, which contains the discovered / assigned
* term identifiers.
*
* TODO no point sending bnodes when readOnly.
*/
@Override
public Result applyOnce(final IIndex ndx, final IRaba keys, final IRaba vals) {
final boolean DEBUG = log.isDebugEnabled();
final int numTerms = keys.size();
assert numTerms > 0 : "numTerms="+numTerms;
// used to store the discovered / assigned term identifiers.
@SuppressWarnings("rawtypes")
final IV[] ivs = new IV[numTerms];
// used to assign term identifiers.
final ICounter counter = ndx.getCounter();
// // true iff this is an unpartitioned index.
// final boolean scaleOut = counter instanceof BTree.PartitionedCounter;
// used to serialize term identifiers.
@SuppressWarnings("resource")
final DataOutputBuffer idbuf = new DataOutputBuffer();
final TermIdEncoder encoder = readOnly ? null
: scaleOutTermIdBitsToReverse == 0 ? null : new TermIdEncoder(
scaleOutTermIdBitsToReverse);
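/*
 * Note: In scale-out, the TermIdEncoder permutes (reverses) some bits of
 * the partition-local counter value so that the assigned term identifiers
 * (and hence the subsequent writes on the id:term index) are spread across
 * the index partitions instead of clustering at the tail of the key range.
 */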
// final DataOutputBuffer kbuf = new DataOutputBuffer(128);
// #of new terms (#of writes on the index).
int nnew = 0;
for (int i = 0; i < numTerms; i++) {
// Note: Copying the key into a buffer does not help since we need
// it in its own byte[] to do lookup against the index.
// getKeys().copy(i, kbuf.reset());
final byte[] key = keys.get(i);
// this byte encodes the kind of term (URI, Literal, BNode, etc.)
final byte code = key[0];//KeyBuilder.decodeByte(key[0]);
if (!storeBlankNodes && code == ITermIndexCodes.TERM_CODE_BND) {
/*
* Do not enter blank nodes into the forward index.
*
* For this case, we just assign a term identifier and leave it
* at that. If two different documents by some chance happen to
* specify the same blank node ID they will still be assigned
* distinct term identifiers. The only way that you can get the
* same term identifier for a blank node is to have the blank
* node ID matched in a canonicalizing map of blank nodes by the
* client. That map, of course, should be scoped to the document
* in which the blank node IDs appear.
*/
if (readOnly) {
// blank nodes can not be resolved by the index.
ivs[i] = null;
} else {
/*
* Assign a term identifier.
*
* Note: The TermIdEncoder is ONLY used in scale-out.
*/
final long ctr = counter.incrementAndGet();
final long termId = encoder == null ? ctr : encoder
.encode(ctr);
ivs[i] = new TermId(VTE(code), termId);
}
} else {
/*
* Lookup in the forward index (URIs, Literals, and SIDs)
*
* Note: Also handles BNodes iff storeBlankNodes is true
*
* @todo reuse Tuple for lookups to reduce allocation (will
* reuse an internal buffer).
*/
final byte[] tmp = ndx.lookup(key);
if (tmp == null) {
// not found.
if(readOnly) {
// not found - will not be assigned.
ivs[i] = null;
} else {
/*
* Assign a term identifier.
*
* Note: The TermIdEncoder is ONLY used in scale-out.
*/
final long ctr = counter.incrementAndGet();
final long termId = encoder == null ? ctr : encoder
.encode(ctr);
@SuppressWarnings("rawtypes")
final TermId<?> iv = new TermId(VTE(code), termId);
if (DEBUG && enableGroundTruth) {
groundTruthTest(key, termId, ndx, counter);
}
final byte[] bytes = iv
.encode(KeyBuilder.newInstance()).getKey();
idbuf.reset().write(bytes);
// insert into index.
if (ndx.insert(key, idbuf.toByteArray()) != null) {
throw new AssertionError();
}
nnew++;
ivs[i] = iv;
}
} else { // found.
ivs[i] = IVUtility.decode(tmp);
}
}
}
/*
* Note: this is for debugging. It does not rely on ground truth, but
* only logs information. It was originally used to track down a lost
* update problem.
*/
// if (enableGroundTruth && ndx.getIndexMetadata().getPartitionMetadata() != null) {
//
// final long v = counter.get();
// final int pid = (int) v >>> 32;
// final int ctr = (int) v;
//
// // note: the mutable btree - accessed here for debugging only.
// final BTree btree;
// if (ndx instanceof AbstractBTree) {
// btree = (BTree) ndx;
// } else {
// btree = (BTree) ((FusedView) ndx).getSources()[0];
// }
//
// log.warn("after task"+
// ": nnew="+nnew+//
// ", partitionId="+ndx.getIndexMetadata().getPartitionMetadata().getPartitionId()+//
// ", pid="+pid+//
// ", ctr="+ctr+//
// ", counter="+counter.getClass().getName()+//
// ", sourceCheckpoint="+btree.getCheckpoint()// btree was loaded from here.
// );
//
// }
return new Result(ivs);
}
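/**
 * Ground truth verification (debug only). Verifies against the global maps
 * that the termId assigned to a key is stable (the same key is never
 * assigned a different termId) and unique (the same termId is never
 * assigned to a different key), throwing an {@link AssertionError} if
 * either invariant is violated.
 *
 * @param key
 *            The sort key for the term.
 * @param termId
 *            The term identifier assigned to that key.
 * @param ndx
 *            The terms index (used for error reporting).
 * @param counter
 *            The counter (used for error reporting).
 */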
private void groundTruthTest(final byte[] key, final long termId, final IIndex ndx,
final ICounter counter) {
if(groundTruthId2Term.isEmpty()) {
log.warn("Ground truth testing enabled.");
}
/*
* Note: add to map if not present. returns the value
* already stored in the map (and null if there was no value
* in the map).
*/
// remember the termId assigned to that key.
final Long oldId = groundTruthTerm2Id.putIfAbsent(key, termId);
if( oldId != null && oldId.longValue() != termId ) {
/*
* The assignment of the term identifier to the key is
* not stable.
*/
throw new AssertionError("different termId assigned"+//
": oldId=" + oldId + //
", newId=" + termId + //
", key=" + BytesUtil.toString(key)+//
", pmd="+ndx.getIndexMetadata().getPartitionMetadata());
}
// remember the key to which we assigned that termId.
final byte[] oldKey = groundTruthId2Term.putIfAbsent(termId, key);
if (oldKey != null && !BytesUtil.bytesEqual(oldKey, key)) {
/*
* The assignment of the term identifier to the key is
* not unique.
*/
// the partition identifier (assuming index is partitioned).
// final long pid = id0 >> 32;
// final long mask = 0xffffffffL;
// final int ctr = (int) (id0 & mask);
throw new AssertionError("assignment not unique"+//
": termId=" + termId +//
", oldKey=" + BytesUtil.toString(oldKey) + //
", newKey=" + BytesUtil.toString(key)+//
", pmd="+ndx.getIndexMetadata().getPartitionMetadata()+//
// ", pid="+pid+", ctr="+ctr+//
", counter="+counter+//
", counter="+counter.getClass().getName());
}
}
@Override
protected void readMetadata(final ObjectInput in) throws IOException, ClassNotFoundException {
super.readMetadata(in);
readOnly = in.readBoolean();
// scaleOutTermIds = in.readBoolean();
scaleOutTermIdBitsToReverse = (int) in.readByte();
}
/**
* Writes metadata (not the keys or values, but just other metadata used by
* the procedure).
*
 * The default implementation writes <code>toIndex - fromIndex</code>,
 * which is the #of keys.
*
* @param out
*
* @throws IOException
*/
@Override
protected void writeMetadata(final ObjectOutput out) throws IOException {
super.writeMetadata(out);
out.writeBoolean(readOnly);
// out.writeBoolean(scaleOutTermIds);
out.writeByte((byte) scaleOutTermIdBitsToReverse);
}
final public static VTE VTE(final byte code) {
switch(code) {
case ITermIndexCodes.TERM_CODE_URI:
return VTE.URI;
case ITermIndexCodes.TERM_CODE_BND:
return VTE.BNODE;
// case ITermIndexCodes.TERM_CODE_STMT:
// return VTE.STATEMENT;
case ITermIndexCodes.TERM_CODE_DTL:
// case ITermIndexCodes.TERM_CODE_DTL2:
case ITermIndexCodes.TERM_CODE_LCL:
case ITermIndexCodes.TERM_CODE_LIT:
return VTE.LITERAL;
default:
throw new IllegalArgumentException("code=" + code);
}
}
/**
* Object encapsulates the discovered / assigned term identifiers and
* provides efficient serialization for communication of those data to the
* client.
*
* @author Bryan Thompson
*/
public static class Result implements Externalizable {
public IV[] ivs;
private static final long serialVersionUID = -8307927320589290348L;
/**
* De-serialization constructor.
*/
public Result() {
}
public Result(final IV[] ivs) {
assert ivs != null;
assert ivs.length > 0;
this.ivs = ivs;
}
private final static transient short VERSION0 = 0x0;
@Override
public void readExternal(final ObjectInput in) throws IOException, ClassNotFoundException {
final short version = ShortPacker.unpackShort(in);
if(version!=VERSION0) {
throw new IOException("Unknown version: "+version);
}
final int n = (int) LongPacker.unpackLong(in);
ivs = new IV[n];
for (int i = 0; i < n; i++) {
// ids[i] = LongPacker.unpackLong(in);
ivs[i] = (IV) in.readObject();
}
}
@Override
public void writeExternal(final ObjectOutput out) throws IOException {
final int n = ivs.length;
ShortPacker.packShort(out, VERSION0);
LongPacker.packLong(out,n);
for (int i = 0; i < n; i++) {
// LongPacker.packLong(out, ids[i]);
out.writeObject(ivs[i]);
}
}
}
/**
* {@link Split}-wise aggregation followed by combining the results across
* those splits in order to return an aggregated result whose iv[] is 1:1
* with the original keys[][].
*/
@Override
protected IResultHandler<Result, Result> newAggregator() {
return new TermResultAggregator(getKeys().size());
}
/**
 * Aggregator collects the per-split results in an internal ordered map and
 * assembles the final aggregated result from those individual results when
 * it is requested. With this approach there is no overhead or contention
 * while the results are being produced in parallel, and they can be
 * combined efficiently within a single thread in {@link #getResult()}.
*
* @author bryan
*/
private class TermResultAggregator extends AbstractLocalSplitResultAggregator<Result> {
/**
*
* @param size
* The #of elements in the request (which is the same as the
* cardinality of the aggregated result).
*/
public TermResultAggregator(final int size) {
super(size);
}
@Override
protected Result newResult(final int size, final SplitValuePair<Split, Result>[] a) {
@SuppressWarnings("rawtypes")
final IV[] ivs = new IV[size];
for (int i = 0; i < a.length; i++) {
final Split split = a[i].key;
final Result tmp = a[i].val;
System.arraycopy(tmp.ivs/* src */, 0/* srcPos */, ivs/* dest */, split.fromIndex/* destPos */,
split.ntuples/* length */);
}
/*
* Return the aggregated result.
*/
final Result r = new Result(ivs);
return r;
}
} // TermResultAggregator
}