// com.bigdata.rdf.lexicon.Term2IdWriteTask Maven / Gradle / Ivy
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.rdf.lexicon;
import java.util.Arrays;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.lexicon.Term2IdWriteProc.Term2IdWriteProcConstructor;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.service.Split;
import com.bigdata.service.ndx.pipeline.KVOList;
/**
* Synchronous RPC write on the TERM2ID index.
*
* @author Bryan Thompson
*/
public class Term2IdWriteTask implements
Callable[]> {
private static transient final Logger log = Logger
.getLogger(Term2IdWriteTask.class);
// private final LexiconRelation r;
private final IIndex termIdIndex;
private final boolean readOnly;
private final boolean storeBlankNodes;
private final int termIdBitsToReverse;
private final int numTerms;
private final BigdataValue[] terms;
private final WriteTaskStats stats;
public Term2IdWriteTask(final IIndex termIdIndex, final boolean readOnly,
final boolean storeBlankNodes, final int termIdBitsToReverse,
final int numTerms, final BigdataValue[] terms,
final WriteTaskStats stats) {
if (termIdIndex == null)
throw new IllegalArgumentException();
if (terms == null)
throw new IllegalArgumentException();
if (numTerms < 0 || numTerms > terms.length)
throw new IllegalArgumentException();
if (stats == null)
throw new IllegalArgumentException();
// this.r = r;
this.termIdIndex = termIdIndex;
this.readOnly = readOnly;
this.storeBlankNodes = storeBlankNodes;
this.termIdBitsToReverse = termIdBitsToReverse;
this.numTerms = numTerms;
this.terms = terms;
this.stats = stats;
}
/**
* Unify the {@link BigdataValue}s with the TERM2ID index, setting the
* term identifiers (TIDs) on those values as a side-effect.
*
* @return A dense {@link KVO}[] chunk consisting of only those
* distinct terms whose term identifier was not already known.
* (This may be used to write on the reverse index).
*
* @throws Exception
*/
public KVO[] call() throws Exception {
/*
* Insert into the forward index (term -> id). This will either assign a
* termId or return the existing termId if the term is already in the
* lexicon.
*/
// The #of distinct terms lacking a pre-assigned term identifier in [a].
int ndistinct = 0;
// A dense array of correlated tuples.
final KVO[] a;
{
final KVO[] b;
/*
* First make sure that each term has an assigned sort key.
*/
{
final long _begin = System.currentTimeMillis();
final Term2IdTupleSerializer tupleSer = (Term2IdTupleSerializer) termIdIndex
.getIndexMetadata().getTupleSerializer();
// may contain duplicates and/or terms with pre-assigned term
// identifiers.
b = generateSortKeys(tupleSer.getLexiconKeyBuilder(), terms,
numTerms);
stats.keyGenTime.add(System.currentTimeMillis() - _begin);
}
/*
* Sort by the assigned sort key. This places the array into the
* natural order for the term:id index.
*/
{
final long _begin = System.currentTimeMillis();
Arrays.sort(b);
stats.keySortTime.add(System.currentTimeMillis() - _begin);
}
/*
* For each distinct term that does not have a pre-assigned term
* identifier, add it to a remote unisolated batch operation that
* assigns term identifiers.
*
* Note: Both duplicate term references and terms with their term
* identifiers already assigned are dropped out in this step.
*/
{
final long _begin = System.currentTimeMillis();
/*
* Create a key buffer holding the sort keys. This does not
* allocate new storage for the sort keys, but rather aligns the
* data structures for the call to splitKeys(). This also makes
* a[] into a dense copy of the references in b[], but without
* duplicates and without terms that already have assigned term
* identifiers. Note that keys[] and a[] are correlated.
*
* @todo Could be restated as an IDuplicateRemover, but note
* that this case is specialized since it can drop terms whose
* term identifier is known (they do not need to be written on
* T2ID, but they still need to be written on the reverse index
* to ensure a robust and consistent mapping).
*/
final byte[][] keys = new byte[numTerms][];
a = new KVO[numTerms];
{
for (int i = 0; i < numTerms; i++) {
if (b[i].obj.getIV() != null) {
if (log.isDebugEnabled())
log.debug("term identifier already assigned: "
+ b[i].obj);
// term identifier already assigned.
continue;
}
if (i > 0 && b[i - 1].obj == b[i].obj) {
if (log.isDebugEnabled())
log.debug("duplicate term reference: "
+ b[i].obj);
// duplicate reference.
continue;
}
// assign to a[] (dense variant of b[]).
a[ndistinct] = b[i];
// assign to keys[] (dense and correlated with a[]).
keys[ndistinct] = b[i].key;
ndistinct++;
}
}
if (ndistinct == 0) {
/*
* Nothing to be written.
*/
return new KVO[0];
}
final AbstractKeyArrayIndexProcedureConstructor ctor =
new Term2IdWriteProcConstructor(
readOnly, storeBlankNodes, termIdBitsToReverse);
// run the procedure.
termIdIndex.submit(0/* fromIndex */, ndistinct/* toIndex */,
keys, null/* vals */, ctor,
new Term2IdWriteProcResultHandler(a, readOnly,
stats.nunknown));
stats.indexTime.addAndGet(stats.forwardIndexTime = System
.currentTimeMillis()
- _begin);
}
}
stats.ndistinct.addAndGet(ndistinct);
return KVO.dense(a, ndistinct);
} // call
/**
* Class applies the term identifiers assigned by the
* {@link Term2IdWriteProc} to the {@link BigdataValue} references in the
* {@link KVO} correlated with each {@link Split} of data processed by that
* procedure.
*
* Note: Of necessity, this requires access to the {@link BigdataValue}s
* whose term identifiers are being resolved. This implementation presumes
* that the array specified to the ctor and the array returned for each
* chunk that is processed have correlated indices and that the offset into
* {@link #a} is given by {@link Split#fromIndex}.
*
* @author Bryan Thompson
*/
static private class Term2IdWriteProcResultHandler implements
IResultHandler {
private final KVO[] a;
private final boolean readOnly;
/**
* @todo this could be the value returned by {@link #getResult()} which
* would make the API simpler.
*/
private final AtomicInteger nunknown;
/**
*
* @param a
* A dense array of {@link KVO}s.
* @param readOnly
* if readOnly was specified for the {@link Term2IdWriteProc}.
* @param nunknown
* Incremented as a side effect for each terms that could not
* be resolved (iff readOnly == true).
*/
public Term2IdWriteProcResultHandler(final KVO[] a,
final boolean readOnly, final AtomicInteger nunknown) {
if (a == null)
throw new IllegalArgumentException();
if (nunknown == null)
throw new IllegalArgumentException();
this.a = a;
this.readOnly = readOnly;
this.nunknown = nunknown;
}
/**
* Copy the assigned / discovered term identifiers onto the
* corresponding elements of the terms[].
*/
@Override
public void aggregate(final Term2IdWriteProc.Result result,
final Split split) {
for (int i = split.fromIndex, j = 0; i < split.toIndex; i++, j++) {
final IV termId = result.ivs[j];
if (termId == null) {
if (!readOnly)
throw new AssertionError();
nunknown.incrementAndGet();
} else {
// assign the term identifier.
a[i].obj.setIV(termId);
if(a[i] instanceof KVOList) {
final KVOList tmp = (KVOList) a[i];
if (!tmp.isDuplicateListEmpty()) {
// assign the term identifier to the duplicates.
tmp.map(new AssignTermId(termId));
}
}
if (log.isDebugEnabled()) {
log.debug("termId=" + termId + ", term=" + a[i].obj);
}
}
}
}
@Override
public Void getResult() {
return null;
}
}
/**
* Generate the sort keys for the terms.
*
* @param keyBuilder
* The object used to generate the sort keys.
* @param terms
* The terms whose sort keys will be generated.
* @param numTerms
* The #of terms in that array.
*
* @return An array of correlated key-value-object tuples.
*
* Note that {@link KVO#val} is null
until we know
* that we need to write it on the reverse index.
*
* @see LexiconKeyBuilder
*/
@SuppressWarnings("unchecked")
final private KVO[] generateSortKeys(
final LexiconKeyBuilder keyBuilder, final BigdataValue[] terms,
final int numTerms) {
final KVO[] a = new KVO[numTerms];
for (int i = 0; i < numTerms; i++) {
final BigdataValue term = terms[i];
a[i] = new KVO(keyBuilder.value2Key(term),
null/* val */, term);
}
return a;
}
}