/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jan 27, 2007
*/
package com.bigdata.rdf.rio;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import org.apache.log4j.Logger;
import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.RDF;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.ICounterSetAccess;
import com.bigdata.counters.Instrument;
import com.bigdata.counters.OneShotInstrument;
import com.bigdata.jsr166.LinkedBlockingQueue;
import com.bigdata.rdf.changesets.ChangeAction;
import com.bigdata.rdf.changesets.ChangeRecord;
import com.bigdata.rdf.changesets.IChangeLog;
import com.bigdata.rdf.model.BigdataBNode;
import com.bigdata.rdf.model.BigdataBNodeImpl;
import com.bigdata.rdf.model.BigdataResource;
import com.bigdata.rdf.model.BigdataStatement;
import com.bigdata.rdf.model.BigdataURI;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.StatementEnum;
import com.bigdata.rdf.spo.ISPO;
import com.bigdata.rdf.spo.SPO;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.IRawTripleStore;
import com.bigdata.rdf.store.TempTripleStore;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.IElementFilter;
import com.bigdata.striterator.ChunkedArrayIterator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.util.concurrent.LatchedExecutor;
/**
* A write buffer for absorbing the output of the RIO parser or other
* {@link Statement} source and writing that output onto an
* {@link AbstractTripleStore} using the batch API.
*
* Note: there is a LOT of {@link Value} duplication in parsed RDF and we get a
* significant reward for reducing {@link Value}s to only the distinct
* {@link Value}s during processing. On the other hand, there is little
* {@link Statement} duplication. Hence we pay an unnecessary overhead if we try
* to make the statements distinct in the buffer.
*
* Note: This also provides an explanation for why neither this class nor writes
* of SPOs do better when "distinct" statements is turned on - the "Value"
* objects in that case are only represented by long integers and duplication in
* their values does not impose a burden on either the heap or the index
* writers. In contrast, the duplication of {@link Value}s in the
* {@link StatementBuffer} imposes a burden on both the heap and the index
* writers.
*
* @author Bryan Thompson
*/
public class StatementBuffer implements IStatementBuffer<Statement>, ICounterSetAccess {
final private static Logger log = Logger.getLogger(StatementBuffer.class);
// final static private boolean INFO = log.isInfoEnabled();
final static private boolean DEBUG = log.isDebugEnabled();
/**
* Buffer for parsed RDF {@link Value}s.
*/
protected final BigdataValue[] values;
/**
* Buffer for parsed RDF {@link Statement}s.
*/
protected final BigdataStatement[] stmts;
/**
* #of valid entries in {@link #values}.
*/
protected int numValues;
/**
* #of valid entries in {@link #stmts}.
*/
protected int numStmts;
/**
* The total number of statements accepted by the {@link StatementBuffer}.
* This can include statements that are currently buffered as well as those
* that have already been queued or written. This is a running total and
* does not attempt to avoid counting duplicates.
*/
private long numTotalStmts;
/**
* @todo consider tossing out these counters - they only add complexity to
* the code in {@link #handleStatement(Resource, URI, Value, StatementEnum)}.
*/
protected int numURIs, numLiterals, numBNodes;
/**
* The #of blank nodes which appear in the context position and zero (0) if
* statement identifiers are not enabled.
*/
protected int numSIDs;
/**
* Map used to filter out duplicate terms. The use of this map provides
* a ~40% performance gain.
*/
final private Map<Value, BigdataValue> distinctTermMap;
/**
* A canonicalizing map for blank nodes. This map MUST be cleared before you
* begin to add statements to the buffer from a new "source" otherwise it
* will co-reference blank nodes from distinct sources. The life cycle of
* the map is the life cycle of the document being loaded, so if you are
* loading a large document with a lot of blank nodes the map will also
* become large.
*/
private Map<String, BigdataBNode> bnodes;
/**
* The #of blank nodes which are not yet resolved and thus will require
* adding to the values array while running {@link #incrementalWrite()}
* @see https://jira.blazegraph.com/browse/BLZG-1708 (DataLoader fails with ArrayIndexOutOfBoundsException)
*/
private int bnodesUnresolvedCount;
/**
* The #of blank nodes which were resolved and thus will not require adding
* to the values array while running {@link #incrementalWrite()}.
* This variable is made volatile, as it is updated from {@link DrainQueueCallable}
* and should be available at the time of access from {@link #nearCapacity()}
* @see https://jira.blazegraph.com/browse/BLZG-1708 (DataLoader fails with ArrayIndexOutOfBoundsException)
*/
private volatile int bnodesResolvedCount;
/**
* Statements which use blank nodes in their {s,p,o} positions must be
* deferred when statement identifiers are enabled until (a) either the
* blank node is observed in the context position of a statement; or (b)
* {@link #flush()} is invoked, indicating that no more data will be loaded
* from the current source and therefore that the blank node is NOT a
* statement identifier. This map is used IFF statement identifiers are
* enabled. When statement identifiers are NOT enabled blank nodes are
* always blank nodes and we do not need to defer statements, only maintain
* the canonicalizing {@link #bnodes} mapping.
*/
private Set<BigdataStatement> deferredStmts;
/**
* RDR statements. Map to a bnode used in other statements. Need to defer
* both the reified statement (since it comes in piecemeal) and the
* statements about it (since we need to make sure the ground version is
* present).
*/
private Map<BigdataBNodeImpl, ReifiedStmt> reifiedStmts;
/**
* <code>true</code> if statement identifiers are enabled.
*
* Note: This is set by the ctor but temporarily overridden during
* {@link #processDeferredStatements()} in order to reuse the
* {@link StatementBuffer} for batch writes of the deferred statement as
* well.
*
* @see AbstractTripleStore#getStatementIdentifiers()
*/
private boolean statementIdentifiers;
/**
* When non-<code>null</code> the statements will be written on this store.
* When <code>null</code> the statements are written onto the
* {@link #database}. (This is used to support incremental truth
* maintenance.)
*/
private final AbstractTripleStore statementStore;
/**
* The optional store into which statements will be inserted when non-<code>null</code>.
*/
@Override
public final AbstractTripleStore getStatementStore() {
return statementStore;
}
/**
* The database that will be used to resolve terms. When
* {@link #statementStore} is <code>null</code>, statements will be written
* into this store as well.
*/
protected final AbstractTripleStore database;
/**
* The arity of the SPORelation for the {@link #getDatabase()}.
*/
private final int arity;
/**
* The database that will be used to resolve terms. When
* {@link #getStatementStore()} is <code>null</code>, statements will be
* written into this store as well.
*/
@Override
public final AbstractTripleStore getDatabase() {
return database;
}
protected final BigdataValueFactory valueFactory;
/**
* Reification vocabulary.
*/
private final BigdataURI RDF_SUBJECT;
private final BigdataURI RDF_PREDICATE;
private final BigdataURI RDF_OBJECT;
private final BigdataURI RDF_STATEMENT;
private final BigdataURI RDF_TYPE;
/**
* The maximum #of Statements, URIs, Literals, or BNodes that the buffer can
* hold. The minimum capacity is three (3) since that corresponds to a
* single triple where all terms are URIs.
*/
private final int bufferCapacity;
/**
* The maximum #of Statements, URIs, Literals, or BNodes that the buffer can
* hold. The minimum capacity is three (3) since that corresponds to a
* single triple where all terms are URIs.
*/
public int getCapacity() {
return bufferCapacity;
}
// /**
// * When true only distinct terms are stored in the buffer (this is always
// * true since this condition always outperforms the alternative).
// */
// protected final boolean distinct = true;
/**
* The capacity of the optional {@link #queue} used to overlap the parser
* with the index writer -or- ZERO (0) iff the queue is disabled and index
* writes will be synchronous and alternate with the parser (the historical
* behavior).
*/
private final int queueCapacity;
/**
* The #of batches added to the {@link #queue}.
*/
private int batchAddCount;
/**
* The #of batches taken from the {@link #queue}.
*/
private int batchTakeCount;
/**
* The number of batches merged.
*/
private int batchMergeCount;
/**
* The #of batches written onto the database.
*/
private int batchWriteCount;
/**
* When non-null, this is a deque that will be used to allow the parser to race
* ahead. Once the writes on the statement indices are done, the queue can
* be drained to a thread that will then merge the batches and batch them
* through to the database.
*
* @see BLZG-641
* @see BLZG-1522
*/
private final LinkedBlockingQueue<Batch<Statement>> queue;
/**
* When non-null, this is a single threaded executor that will be used to
* drain {@link #queue} and batch updates through to the database.
*/
private final Executor executor;
/**
* When the {@link #queue} is being used, this is the {@link Future} of the
* current task (if any) that is writing the current {@link Batch} onto the
* database.
*
* Note: This is lazily initialized since {@link #reset()} does not have the
* semantics of "close()" and the {@link StatementBuffer} MIGHT be reused.
*/
private volatile FutureTask<Void> ft;
/**
* The capacity of the optional queue used to overlap the parser with the
* index writer -or- ZERO (0) iff the queue is disabled and index writes
* will be synchronous and alternate with the parser (the historical
* behavior).
*
* @see BLZG-1552
*/
public int getQueueCapacity() {
return queueCapacity;
}
@Override
public boolean isEmpty() {
return numStmts == 0;
}
@Override
public int size() {
return numStmts;
}
@Override
public String toString() {
return "numURIs=" + numURIs
+ ", numLiterals=" + numLiterals
+ ", numBNodes=" + numBNodes
+ ", numStmts=" + numStmts
+ ", numValues=" + numValues
+ ", numSids=" + numSIDs
+ ", values.length=" + (values != null ? String.valueOf(values.length) : "null")
+ ", stmts.length=" + (stmts != null ? String.valueOf(stmts.length) : "null")
+ ", bnodes.size()=" + (bnodes != null ? String.valueOf(bnodes.size()) : "null")
+ ", distinctTermMap.size()=" + (distinctTermMap != null ? String.valueOf(distinctTermMap.size()) : "null")
+ ", reifiedStmts.size()=" + (reifiedStmts != null ? String.valueOf(reifiedStmts.size()) : "null")
+ ", deferredStmts.size()=" + (deferredStmts != null ? String.valueOf(deferredStmts.size()) : "null")//
+ (queue == null ? "" : ", queue.size=" + queue.size())//
;
}
@Override
public CounterSet getCounters() {
final CounterSet counters = new CounterSet();
counters.addCounter("readOnly", new OneShotInstrument(readOnly));
counters.addCounter("bnodesSize", new Instrument() {
@Override
public void sample() {
final Map<String, BigdataBNode> t = bnodes;
if (t != null)
setValue(t.size());
}
});
counters.addCounter("bnodesUnresolvedCount", new Instrument() {
@Override
public void sample() {
setValue(bnodesUnresolvedCount);
}
});
counters.addCounter("bnodesResolvedCount", new Instrument() {
@Override
public void sample() {
setValue(bnodesResolvedCount);
}
});
counters.addCounter("distinctTermMapSize", new Instrument() {
@Override
public void sample() {
final Map<Value, BigdataValue> t = distinctTermMap;
if (t != null)
setValue(t.size());
}
});
counters.addCounter("bufferCapacity", new OneShotInstrument(bufferCapacity));
// Note: tracked even when the queue is not enabled.
counters.addCounter("batchAddCount", new Instrument() {
@Override
public void sample() {
setValue(batchAddCount);
}
});
// Note: tracked even when the queue is not enabled.
counters.addCounter("batchWriteCount", new Instrument() {
@Override
public void sample() {
setValue(batchWriteCount);
}
});
if (queue != null) {
// Only defined when the queue is enabled.
counters.addCounter("queueCapacity", new OneShotInstrument(queueCapacity));
counters.addCounter("queueSize", new Instrument() {
@Override
public void sample() {
final LinkedBlockingQueue<Batch<Statement>> t = queue;
if (t != null)
setValue(t.size());
}
});
counters.addCounter("batchTakeCount", new Instrument() {
@Override
public void sample() {
setValue(batchTakeCount);
}
});
counters.addCounter("batchMergeCount", new Instrument() {
@Override
public void sample() {
setValue(batchMergeCount);
}
});
}
return counters;
}
/**
* When invoked, the {@link StatementBuffer} will resolve terms against the
* lexicon, but not enter new terms into the lexicon. This mode can be used
* to efficiently resolve terms to {@link SPO}s.
*
* @todo An {@link IBuffer} pattern could be used to make the statement
* buffer chunk-at-a-time. The buffer has a readOnly argument and will
* visit SPOs for the source statements. When readOnly, new terms will
* not be added to the database.
*
* @todo Once we have the {@link SPO}s we can just feed them into whatever
* consumer we like and do bulk completion, bulk filtering, write the
* SPOs onto the database, etc.
*
* @todo must also support the focusStore patterns, which should not be too
* difficult.
*/
public void setReadOnly() {
this.readOnly = true;
}
private boolean readOnly = false;
/**
* Set an {@link IChangeLog} listener that will be notified about each
* statement actually written onto the backing store.
*
* @param changeLog
* The change log listener.
*/
public void setChangeLog(final IChangeLog changeLog) {
this.changeLog = changeLog;
}
/**
* When non-null, this is an {@link IChangeLog} listener that will be
* notified about each statement actually written onto the backing store.
*/
private IChangeLog changeLog;
/**
* Note: The use of this interface is NOT encouraged. It is used to hook the
* axioms in {@link com.bigdata.rdf.axioms.BaseAxioms}. Ideally this could
* be backed out in favor of using the {@link IChangeLog} but I was not able
* to make that work out very easily.
*
* @author bryan
*
* @see BLZG-1552
*/
public interface IWrittenSPOArray {
/**
* A callback that is invoked with the statements actually written onto the
* backing store. The default implementation is a NOP.
*
* @param stmts
* An array of the statements written onto the backing store.
* @param numStmts
* The number of entries in that array that were written.
*/
void didWriteSPOs(final SPO[] stmts, final int numStmts);
}
protected IWrittenSPOArray didWriteCallback = null;
/**
* Create a buffer that converts Sesame {@link Value} objects to {@link SPO}
* s and writes on the database when it is {@link #flush()}ed. This
* may be used to perform efficient batch writes of Sesame {@link Value}s or
* {@link Statement}s onto the database. If you already have
* {@link SPO}s then use
* {@link IRawTripleStore#addStatements(IChunkedOrderedIterator, IElementFilter)}
* and friends.
*
* @param database
* The database into which the terms and statements will be
* inserted.
* @param capacity
* The #of statements that the buffer can hold.
*/
public StatementBuffer(final AbstractTripleStore database, final int capacity) {
this(database, capacity, 10/* defaultQueueCapacity */);
}
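/*
 * A minimal usage sketch (illustrative only; assumes an open
 * AbstractTripleStore named "store" and values s, p, o obtained from its
 * BigdataValueFactory):
 *
 *   final StatementBuffer buffer = new StatementBuffer(store, 100000);
 *   buffer.add(s, p, o); // absorb statements; flushes incrementally near capacity.
 *   buffer.flush();      // end of source: write anything still buffered.
 */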
public StatementBuffer(final AbstractTripleStore database, final int capacity, final int queueCapacity) {
this(null/* statementStore */, database, capacity, queueCapacity);
}
/**
* Create a buffer that writes on a {@link TempTripleStore} when it is
* {@link #flush()}ed. This variant is used during truth maintenance since
* the terms are written on the database lexicon but the statements are
* asserted against the {@link TempTripleStore}.
*
* @param statementStore
* The store into which the statements will be inserted
* (optional). When <code>null</code>, both statements and terms
* will be inserted into the database. This optional
* argument provides the ability to load statements into a
* temporary store while the terms are resolved against the main
* database. This facility is used during incremental load+close
* operations.
* @param database
* The database. When statementStore is <code>null</code>,
* both terms and statements will be inserted into the
* database.
* @param capacity
* The #of statements that the buffer can hold.
* @param queueCapacity
* The capacity of blocking queue used by the
* {@link StatementBuffer} -or- ZERO (0) to disable the blocking
* queue and perform synchronous writes (default is
* {@value #DEFAULT_QUEUE_CAPACITY} statements). The blocking
* queue holds parsed data pending writes onto the backing store
* and makes it possible for the parser to race ahead while the
* writer is blocked writing onto the database indices.
*
* @see BLZG-1552 (added blocking queue)
*/
public StatementBuffer(final TempTripleStore statementStore,
final AbstractTripleStore database, final int capacity,
final int queueCapacity
) {
if (database == null)
throw new IllegalArgumentException();
if (capacity <= 0)
throw new IllegalArgumentException();
if (queueCapacity < 0)
throw new IllegalArgumentException();
this.statementStore = statementStore; // MAY be null.
this.database = database;
this.arity = database.getSPOKeyArity();
this.valueFactory = database.getValueFactory();
this.bufferCapacity = capacity;
this.queueCapacity = queueCapacity;
values = new BigdataValue[capacity * arity + 5];
stmts = new BigdataStatement[capacity];
/*
* initialize capacity to N times the #of statements allowed. this
* is the maximum #of distinct terms and would only be realized if
* each statement used distinct values. in practice the #of distinct
* terms will be much lower. however, also note that the map will be
* resized at .75 of the capacity so we want to over-estimate the
* maximum likely capacity by at least 25% to avoid re-building the
* hash map.
*/
distinctTermMap = new HashMap<Value, BigdataValue>(capacity * arity);
this.statementIdentifiers = database.getStatementIdentifiers();
if(log.isInfoEnabled()) {
log.info("capacity=" + capacity + ", sids=" + statementIdentifiers
+ ", statementStore=" + statementStore + ", database="
+ database + ", arity=" + arity);
}
this.RDF_SUBJECT = valueFactory.asValue(RDF.SUBJECT);
this.RDF_PREDICATE = valueFactory.asValue(RDF.PREDICATE);
this.RDF_OBJECT = valueFactory.asValue(RDF.OBJECT);
this.RDF_STATEMENT = valueFactory.asValue(RDF.STATEMENT);
this.RDF_TYPE = valueFactory.asValue(RDF.TYPE);
/*
* Get the reification vocabulary into the distinct term map.
*/
getDistinctTerm(RDF_SUBJECT, true);
getDistinctTerm(RDF_PREDICATE, true);
getDistinctTerm(RDF_OBJECT, true);
getDistinctTerm(RDF_STATEMENT, true);
getDistinctTerm(RDF_TYPE, true);
/**
* TODO BLZG-1522. There is some odd interaction with SIDS that causes a
* thrown exception from BigdataBNodeImpl.getIV() when the queue is used
* with sids....
*
*
* throw new UnificationException("illegal self-referential sid");
*
*/
if (true && !statementIdentifiers && queueCapacity != 0) {
/*
* Setup a deque that will be used to allow the parser to race ahead.
* Once the writes on the statement indices are done, the queue can
* be drained to a thread that will then merge the batches and batch
* them through to the database.
*
* @see BLZG-641
*
* @see BLZG-1522
*
* @see BLZG-1813
*/
queue = new LinkedBlockingQueue<Batch<Statement>>(queueCapacity/* capacity */);
/*
* Setup executor used to drain the queue, merge the batches and
* write on the backing store.
*
* Note: executor is backed by the database executor service and has
* a maximum parallelism of one.
*/
executor = new LatchedExecutor(database.getExecutorService(), 1/* nparallel */);
} else {
/*
* Do not use the queue. incrementalWrite() will synchronously write
* onto the backing store.
*/
queue = null;
executor = null;
ft = null;
}
}
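/*
 * A sketch of how the queueCapacity argument selects between the two write
 * paths (capacities are illustrative only):
 *
 *   new StatementBuffer(store, 100000, 0);  // queue disabled: the parser
 *                                           // blocks in incrementalWrite()
 *                                           // while each batch is written.
 *   new StatementBuffer(store, 100000, 10); // queue enabled: up to 10 parsed
 *                                           // batches are buffered while a
 *                                           // single writer thread drains
 *                                           // them, so the parser can race
 *                                           // ahead of the index writer.
 */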
/**
* Added to ensure that the {@link FutureTask} is cancelled in case the
* caller does not shutdown the {@link StatementBuffer} normally.
*/
@Override
protected void finalize() throws Throwable {
super.finalize();
if (ft != null) {
reset();
}
}
// /**
// * Evict a batch (blocking put, but spins to look for an error in
// * {@link Future} for the thread draining the queue.
// *
// * @param batch
// * A batch (required).
// *
// * @throws InterruptedException
// */
// private void putOnQueue(final Batch batch) throws InterruptedException {
//
// Future<Void> f;
// while ((f = ft) != null && !f.isDone()) {
//
// if (queue.offer(batch, 100L, TimeUnit.MILLISECONDS)) {
//
// return;
//
// }
//
// }
//
// if (f == null) {
//
// /*
// * The Future of the task draining the queue has been cleared (most
// * likely due to an error or interrupt). At this point nothing more
// * will be drained from the queue.
// */
//
// throw new RuntimeException("Writer is done, but reader still working?");
//
// } else if (f.isDone()) {
//
// /*
// * This is most likely to indicate either an error or interrupt in
// * the writer. At this point nothing more will be drained from the
// * queue.
// */
//
// throw new RuntimeException("Writer is done, but reader still working?");
//
// }
//
// }
/**
* Drains {@link Batch}es from the queue and writes on the database.
*
* @author bryan
*
* @see BLZG-1522
*/
private class DrainQueueCallable implements Callable<Void> {
private boolean exhausted = false;
@Override
public Void call() throws Exception {
while (!exhausted) {
// Block and wait for a batch.
final Batch<Statement> batch = queue.take();
if (batch == Batch.POISON_PILL) {
// Done.
exhausted = true;
continue;
} else batchTakeCount++;
if (queue.isEmpty()) {
// Nothing else in the queue. Write out the batch immediately.
final BatchResult batchResult = batch.writeNow();
bnodesResolvedCount += batchResult.getNumBNodesResolved();
batchWriteCount++;
continue;
}
drainQueueAndMergeBatches(batch);
} // block and wait for the next batch.
// done.
return null;
} // call()
/**
* There is more in the queue. Drain it. Watch out for that poison pill!
*
* Note: Maximum from drainTo() is queueCapacity. Plus 1 since we
* already have one batch on hand.
*/
private void drainQueueAndMergeBatches(final Batch<Statement> batch) {
if (batch == null)
throw new IllegalArgumentException();
if (batch == Batch.POISON_PILL) // DO NOT pass the poison pill!
throw new IllegalArgumentException();
final List<Batch<Statement>> avail = new LinkedList<Batch<Statement>>();
// add the batch already on hand (from caller)
avail.add(batch);
// drain the queue while queue is *known* to contain something.
while (!exhausted && !queue.isEmpty()) {
// non-blocking take. should be available immediately. but
// there *might* have been a clear() call.
final Batch<Statement> anotherBatch = queue.poll();
if (anotherBatch == null) {
// Note: This could arise through a concurrent clear of
// the queue.
exhausted = true;
} else if (anotherBatch == Batch.POISON_PILL) {
// Done.
exhausted = true;
} else {
// Add to the set that we will merge together.
avail.add(anotherBatch);
batchTakeCount++;
}
}
if (avail.size() == 1) {
/*
* Safety check. Do not merge a single batch.
*/
final BatchResult batchResult = avail.get(0).writeNow();
bnodesResolvedCount += batchResult.getNumBNodesResolved();
batchWriteCount++;
} else {
// Merge the batches together and then write them out.
final BatchResult batchResult = new MergeUtility<Statement>().merge(avail).writeNow();
bnodesResolvedCount += batchResult.getNumBNodesResolved();
batchMergeCount += avail.size();
batchWriteCount++;
}
}
} // DrainQueueCallable
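/*
 * The hand-off above is the standard poison-pill shutdown for a blocking
 * queue. A self-contained sketch of the same pattern (not part of this
 * class; "process" is a hypothetical handler):
 *
 *   final BlockingQueue<String> q = new LinkedBlockingQueue<String>(10);
 *   final String POISON = new String("POISON"); // unique reference.
 *   // producer: q.put("work"); ...; q.put(POISON);
 *   // consumer:
 *   String item;
 *   while ((item = q.take()) != POISON) { // reference test, as above.
 *       process(item);
 *   }
 */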
/**
* Signals the end of a source and causes all buffered statements to be
* written.
*
* Note: The source limits the scope within which blank nodes are
* co-referenced by their IDs. Calling this method will flush the buffer,
* cause any deferred statements to be written, and cause the canonicalizing
* mapping for blank nodes to be discarded.
*
* @todo this implementation always returns ZERO (0).
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public long flush() {
// log.warn("");
/*
* Process deferred statements (NOP unless using statement identifiers).
*/
// processDeferredStatements();
// flush anything left in the buffer.
incrementalWrite();
if (queue != null) {
// Drop a poison pill on the queue.
try {
queue.put((Batch<Statement>) Batch.POISON_PILL);
// block and wait until the flush is done.
final Future<Void> ft = this.ft;
if (ft != null) {
ft.get();
}
} catch (InterruptedException e) {
// Cancel task and propagate interrupt.
ft.cancel(true/* mayInterruptIfRunning */);
Thread.currentThread().interrupt();
} catch (ExecutionException e) {
// Wrap and throw.
throw new RuntimeException(e);
}
}
// discard all buffer state (including bnodes and deferred statements).
reset();
return 0L;
}
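/*
 * For example (illustrative): two documents may both use the blank node
 * label "_:b0". Invoking flush() between the two sources discards the
 * canonicalizing bnode map so that the two labels are NOT unified:
 *
 *   buffer.add(...);  // statements from document 1 (uses "_:b0").
 *   buffer.flush();   // end of document 1: bnode map is discarded.
 *   buffer.add(...);  // "_:b0" in document 2 is a distinct blank node.
 *   buffer.flush();
 */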
// /**
// * Processes the {@link #deferredStmts deferred statements}.
// *
// * When statement identifiers are enabled the processing of statements using
// * blank nodes in their subject or object position must be deferred until we
// * know whether or not the blank node is being used as a statement
// * identifier (blank nodes are not allowed in the predicate position by the
// * RDF data model). If the blank node is being used as a statement
// * identifier then its {@link IV} will be assigned based on
// * the {s,p,o} triple. If it is being used as a blank node, then the
// * {@link IV} is assigned using the blank node ID.
// *
// * Deferred statements are processed as follows:
// *
// *
// * - Collect all deferred statements whose blank node bindings never show
// * up in the context position of a statement (
// * {@link BigdataBNode#getStatementIdentifier()} is <code>false</code>).
// * Those blank nodes are NOT statement identifiers so we insert them into
// * the lexicon and the insert the collected statements as well.
// *
// * - The remaining deferred statements are processed in "cliques". Each
// * clique consists of all remaining deferred statements whose {s,p,o} have
// * become fully defined by virtue of a blank node becoming bound as a
// * statement identifier. A clique is collected by a full pass over the
// * remaining deferred statements. This process repeats until no statements
// * are identified (an empty clique or fixed point).
// *
// *
// * If there are remaining deferred statements then they contain cycles. This
// * is an error and an exception is thrown.
// *
// * @todo on each {@link #flush()}, scan the deferred statements for those
// * which are fully determined (bnodes are flagged as statement
// * identifiers) to minimize the build up for long documents?
// */
// protected void processDeferredStatements() {
//
// if (!statementIdentifiers || deferredStmts == null
// || deferredStmts.isEmpty()) {
//
// // NOP.
//
// return;
//
// }
//
// if (log.isInfoEnabled())
// log.info("processing " + deferredStmts.size()
// + " deferred statements");
//
// /*
// * Need to flush the terms out to the dictionary or the reification
// * process will not work correctly.
// */
// incrementalWrite();
//
// try {
//
// // Note: temporary override - clear by finally{}.
// statementIdentifiers = false;
//
// // stage 0
// if (reifiedStmts != null) {
//
// for (Map.Entry<BigdataBNodeImpl, ReifiedStmt> e : reifiedStmts.entrySet()) {
//
// final BigdataBNodeImpl sid = e.getKey();
//
// final ReifiedStmt reifiedStmt = e.getValue();
//
// if (!reifiedStmt.isFullyBound(arity)) {
//
// log.warn("unfinished reified stmt: " + reifiedStmt);
//
// continue;
//
// }
//
// final BigdataStatement stmt = valueFactory.createStatement(
// reifiedStmt.getSubject(),
// reifiedStmt.getPredicate(),
// reifiedStmt.getObject(),
// reifiedStmt.getContext(),
// StatementEnum.Explicit);
//
// sid.setStatement(stmt);
//
// sid.setIV(new SidIV(new SPO(stmt)));
//
// if (log.isInfoEnabled()) {
// log.info("reified sid conversion: sid=" + sid + ", stmt=" + stmt);
// }
//
// }
//
// if (log.isInfoEnabled()) {
//
// for (BigdataBNodeImpl sid : reifiedStmts.keySet()) {
//
// log.info("sid: " + sid + ", iv=" + sid.getIV());
//
// }
//
// }
//
// }
//
// // stage 1.
// {
//
// final int nbefore = deferredStmts.size();
//
// int n = 0;
//
// final Iterator<BigdataStatement> itr = deferredStmts.iterator();
//
// while(itr.hasNext()) {
//
// final BigdataStatement stmt = itr.next();
//
// if (stmt.getSubject() instanceof BNode
// && ((BigdataBNode) stmt.getSubject()).isStatementIdentifier())
// continue;
//
// if (stmt.getObject() instanceof BNode
// && ((BigdataBNode) stmt.getObject()).isStatementIdentifier())
// continue;
//
// if(log.isDebugEnabled()) {
// log.debug("grounded: "+stmt);
// }
//
// if (stmt.getSubject() instanceof BNode)
// addTerm(stmt.getSubject());
//
// if (stmt.getObject() instanceof BNode)
// addTerm(stmt.getObject());
//
// // fully grounded so add to the buffer.
// add(stmt);
//
// // the statement has been handled.
// itr.remove();
//
// n++;
//
// }
//
// if (log.isInfoEnabled())
// log.info(""+ n
// + " out of "
// + nbefore
// + " deferred statements used only blank nodes (vs statement identifiers).");
//
// /*
// * Flush everything in the buffer so that the blank nodes that
// * are really blank nodes will have their term identifiers
// * assigned.
// */
//
// incrementalWrite();
//
// }
//
// // stage 2.
// if(!deferredStmts.isEmpty()) {
//
// int nrounds = 0;
//
// while(true) {
//
// nrounds++;
//
// final int nbefore = deferredStmts.size();
//
// final Iterator<BigdataStatement> itr = deferredStmts.iterator();
//
// while(itr.hasNext()) {
//
// final BigdataStatement stmt = itr.next();
//
// if (log.isDebugEnabled()) {
// log.debug(stmt.getSubject() + ", iv=" + stmt.s());
// }
//
// if (stmt.getSubject() instanceof BNode
// && ((BigdataBNode) stmt.getSubject()).isStatementIdentifier()
// && stmt.s() == null)
// continue;
//
// if (stmt.getObject() instanceof BNode
// && ((BigdataBNode) stmt.getObject()).isStatementIdentifier()
// && stmt.o() == null)
// continue;
//
// if (log.isDebugEnabled()) {
// log.debug("round="+nrounds+", grounded: "+stmt);
// }
//
// // fully grounded so add to the buffer.
// add(stmt);
//
// // deferred statement has been handled.
// itr.remove();
//
// }
//
// final int nafter = deferredStmts.size();
//
// if (log.isInfoEnabled())
// log.info("round=" + nrounds+" : #before="+nbefore+", #after="+nafter);
//
// if(nafter == nbefore) {
//
// if (log.isInfoEnabled())
// log.info("fixed point after " + nrounds
// + " rounds with " + nafter
// + " ungrounded statements");
//
// break;
//
// }
//
// /*
// * Flush the buffer so that we can obtain the statement
// * identifiers for all statements in this clique.
// */
//
// incrementalWrite();
//
// } // next clique.
//
// final int nremaining = deferredStmts.size();
//
// if (nremaining > 0) {
//
// if (log.isDebugEnabled()) {
//
// for (BigdataStatement s : deferredStmts) {
// log.debug("could not ground: " + s);
// }
//
// }
//
// throw new StatementCyclesException(
// "" + nremaining
// + " statements can not be grounded");
//
// }
//
//
// } // stage 2.
//
// } finally {
//
// // Note: restore flag!
// statementIdentifiers = true;
//
// deferredStmts = null;
//
// reifiedStmts = null;
//
// }
//
// }
/**
* Clears all buffered data, including the canonicalizing mapping for blank
* nodes and deferred provenance statements.
*/
@Override
public void reset() {
_clear();
/*
* Note: clear the reference NOT the contents of the map! This makes it
* possible for the caller to reuse the same map across multiple
* StatementBuffer instances.
*/
bnodes = null;
bnodesUnresolvedCount = 0;
bnodesResolvedCount = 0;
deferredStmts = null;
reifiedStmts = null;
if (queue != null) {
final Future<Void> ft = this.ft;
if (ft != null) {
// Cancel any running task.
ft.cancel(true/* mayInterruptIfRunning */);
this.ft = null;
}
// Clear the queue.
queue.clear();
}
}
/**
* @todo could be replaced with {@link BigdataValueFactory}
*/
@Override
public void setBNodeMap(final Map<String, BigdataBNode> bnodes) {
if (bnodes == null)
throw new IllegalArgumentException();
if (this.bnodes != null)
throw new IllegalStateException();
this.bnodes = bnodes;
bnodesUnresolvedCount = 0;
bnodesResolvedCount = 0;
for (BigdataBNode bnode: bnodes.values()) {
if (bnode.getIV() == null) {
bnodesUnresolvedCount++;
}
}
}
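/*
 * A sketch of the intended use (illustrative): the caller may install the
 * same map on successive StatementBuffer instances so that they
 * co-reference blank nodes from a single logical source (see reset(),
 * which clears the reference but not the map's contents):
 *
 *   final Map<String, BigdataBNode> sharedBNodes =
 *       new LinkedHashMap<String, BigdataBNode>();
 *   bufferA.setBNodeMap(sharedBNodes);
 *   // ... load part of the source, then continue with another buffer:
 *   bufferB.setBNodeMap(sharedBNodes);
 */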
/**
* Invoked by {@link #incrementalWrite()} to clear terms and statements
* which have been written in preparation for buffering more writes. This
* discards neither the canonicalizing mapping for blank nodes nor
* any deferred statements.
*/
protected void _clear() {
// Avoid potential IndexOutOfBoundsException in _clear().
// @see https://jira.blazegraph.com/browse/BLZG-1708 (DataLoader fails with ArrayIndexOutOfBoundsException)
final int nvalues = Math.min(values.length, numValues);
final int nstmts = Math.min(stmts.length, numStmts);
for (int i = 0; i < nvalues; i++) {
values[i] = null;
}
for (int i = 0; i < nstmts; i++) {
stmts[i] = null;
}
numURIs = numLiterals = numBNodes = numStmts = numValues = 0;
numSIDs = 0;
if (distinctTermMap != null) {
distinctTermMap.clear();
/*
* Get the reification vocabulary into the distinct term map.
*/
getDistinctTerm(RDF_SUBJECT, true);
getDistinctTerm(RDF_PREDICATE, true);
getDistinctTerm(RDF_OBJECT, true);
getDistinctTerm(RDF_STATEMENT, true);
getDistinctTerm(RDF_TYPE, true);
}
// clearBNodeMap();
}
/**
* Batch insert buffered data (terms and statements) into the store.
*/
protected void incrementalWrite() {
/*
* Look for non-sid bnodes and add them to the values to be written
* to the database (if they haven't already been written).
*/
if (bnodes != null) {
for (BigdataBNode bnode : bnodes.values()) {
// sid, skip
if (bnode.isStatementIdentifier())
continue;
// already written, skip
if (bnode.getIV() != null)
continue;
values[numValues++] = bnode;
numBNodes++;
}
}
// Buffer a batch and then incrementally flush.
if (queue == null) {
final BatchResult batchResult = new Batch<Statement>(this, false/* clone */).writeNow();
bnodesResolvedCount += batchResult.getNumBNodesResolved();
batchWriteCount++;
// Reset the state of the buffer (but not the bnodes nor deferred stmts).
_clear();
} else {
if (ft == null || ft.isDone() /* BLZG-1813 */) {
/*
* If the future is done, get the future, and
* propagate any exceptions.
*
* @see BLZG-1813
*/
if (ft != null && ft.isDone()) {
try {
ft.get(); // get the future.
/*
* Fall through. New Future will be created below.
*/
} catch (InterruptedException e) {
// Propagate interrupt
throw new RuntimeException(e);
} catch (ExecutionException ex) {
throw new RuntimeException(ex);
}
}
/*
* Note: Lazily initialized since reset() does not make the
* StatementBuffer object invalid for further use.
*/
ft = new FutureTask<Void>(new DrainQueueCallable());
executor.execute(ft);
}
try {
// Blocking put.
queue.put(new Batch<Statement>(this, true/* clone */));
batchAddCount++;
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
/**
* A utility class to merge {@link Batch}es together while maintaining their
* distinct {@link Value}[]s.
*
* @author bryan
* @see BLZG-1522
* @param <S>
*/
private static class MergeUtility<S extends Statement> {
/*
* used by merge(). single threaded access.
*/
private int numValues;
private BigdataValue[] values;
private Map<Value, BigdataValue> distinctTermMap;
MergeUtility() {
}
/**
* Merge a set of batches together.
*
* @param avail
* The available batches.
*
* @throws IllegalArgumentException
* if the argument is null.
* @throws IllegalArgumentException
* if the argument is an empty list.
* @throws IllegalArgumentException
* if the argument does not contain at least two batches.
*/
public Batch<S> merge(final List<Batch<S>> avail) {
if (avail == null)
throw new IllegalArgumentException();
if (avail.isEmpty())
throw new IllegalArgumentException();
if (avail.size() < 2)
throw new IllegalArgumentException();
if (distinctTermMap != null) {
// An attempt to reuse a MergeUtility object.
throw new IllegalStateException();
}
/*
* We need to create a new combined Statement[] containing only the
* distinct Values and a new Value[] in which those distinct values
* are entered. This removes duplicates from the Value[] which is
* quite important for throughput. It is not as critical to de-dup
* the Statement[] as duplicate statements are uncommon and do not
* incur much overhead since we will sort the statements before
* writing on the indices.
*
* TODO We could potentially run into problems with a very large
* capacity since the combined size of the values[] (without
* duplicate removal) could exceed an int32 value. But this is not
* likely.
*/
// find maximum size for arrays.
int maxValues = 0;
int maxStmts = 0;
{
for (Batch<S> sb : avail) {
maxValues += sb.numValues;
maxStmts += sb.numStmts;
}
// we will de-dup the values below.
values = new BigdataValue[maxValues];
// set map to find the distinct Values.
distinctTermMap = new HashMap<Value, BigdataValue>(maxValues);
}
// copy statements, finding distinct Values.
final int numStmts;
final BigdataStatement[] stmts;
{
// we will not de-dup the statements.
stmts = new BigdataStatement[maxStmts];
int n = 0;
for (Batch<S> sb : avail) {
for (int i = 0; i < sb.numStmts; i++, n++) {
// Create new statement using distinct values.
final BigdataStatement stmt = (BigdataStatement) sb.stmts[i];
final BigdataResource s = (BigdataResource) getDistinctTerm(stmt.getSubject());
final BigdataURI p = (BigdataURI) getDistinctTerm(stmt.getPredicate());
final BigdataValue o = getDistinctTerm(stmt.getObject());
final BigdataResource c = stmt.getContext() == null ? null
: (BigdataResource) getDistinctTerm(stmt.getContext());
stmts[n] = s.getValueFactory().createStatement(s, p, o, c, stmt.getStatementType());
}
}
numStmts = n;
}
final Batch<S> sb = avail.get(0);
return new Batch<S>(sb.database, // copy by reference
sb.statementStore, // copy by reference
sb.readOnly, // copy by reference
sb.changeLog, // copy by reference
sb.didWriteCallback, // copy by reference
numValues, // copied the data.
values, // copied the data.
numStmts, // copied the data.
stmts // copied the data.
);
} // merge()
/**
* Canonicalizing mapping for a term when merging {@link Batch}es
* together. This is simpler than the general case since we have already
* handled blank nodes, SIDs, etc. in the outer context.
*
* @param term
* A term.
*
* @return Either the term or the pre-existing term in the {@link Batch}
* with the same data.
*
* @throws IllegalArgumentException
* if the argument is null (so do not pass a null context in
* here!)
*/
private BigdataValue getDistinctTerm(final BigdataValue term) {
if (term == null)
throw new IllegalArgumentException();
// TODO BLZG-1532 (JAVA8) replace with putIfAbsent()
final BigdataValue existingTerm = distinctTermMap.get(term);
if (existingTerm != null) {
/*
* Term already exists, do not add.
*/
return existingTerm;
}
// put the new term in the map.
if (distinctTermMap.put(term, term) != null) {
throw new AssertionError();
}
values[numValues++] = term;
// return the new term.
return term;
}
}
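/*
 * Worked example (illustrative): merging a batch whose values[] holds
 * {ex:a, ex:b} with a batch whose values[] holds {ex:b, ex:c} produces a
 * single batch whose values[] holds only {ex:a, ex:b, ex:c}, while the two
 * Statement[]s are simply concatenated (statement duplicates are uncommon
 * and cheap, so they are not removed).
 */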
/**
* Result of the {@link Batch} execution, consisting of the #of statements
* written onto the database and the #of bnodes which had their IVs
* assigned during the incremental write.
*
* @author kim
*
* @see BLZG-1708
*/
private static class BatchResult {
private final long nwritten;
private final long nBnodesResolved;
public BatchResult(final long nwritten, final long nBnodesResolved) {
this.nwritten = nwritten;
this.nBnodesResolved = nBnodesResolved;
}
public long getNumWritten() {
return nwritten;
}
public long getNumBNodesResolved() {
return nBnodesResolved;
}
}
/**
* A batch of statements together with their distinct values to be written
* onto the database.
*
* @author bryan
*
* @see BLZG-1522
*/
private static class Batch<S extends Statement> {
/**
* Singleton instance used to indicate that no more elements will be
* added to the queue.
*/
@SuppressWarnings("rawtypes")
private static final Batch<?> POISON_PILL = new Batch();
/*
* All of these are fields from the outer class. They have either been
* copied by reference or cloned depending on the constructor call.
*
* Note: I have explicitly replicated them here to provide a boundary
* around the state that can be dropped into the queue and the state
* that is being used to absorb statements from the parser. This is why
* Batch is a *static* class.
*
* Note: One benefit of this boundary is that we know that the
* StatementBuffer bnodes map is NOT used from within this class so we
* do not need to worry about concurrency control for that collection.
*/
private final AbstractTripleStore database;
private final AbstractTripleStore statementStore;
private final boolean readOnly;
private final IChangeLog changeLog;
private final IWrittenSPOArray didWriteCallback;
private final int numValues;
private final BigdataValue[] values;
private final int numStmts;
private final BigdataStatement[] stmts;
/**
* Singleton instance constructor.
*/
private Batch() {
database = null;
statementStore = null;
readOnly = true;
changeLog = null;
didWriteCallback = null;
numValues = 0;
values = null;
numStmts = 0;
stmts = null;
}
/**
* Constructor used when merging multiple batches.
*/
private Batch( final AbstractTripleStore database, //
final AbstractTripleStore statementStore, //
final boolean readOnly, //
final IChangeLog changeLog, //
final IWrittenSPOArray didWriteCallback, //
final int numValues, //
final BigdataValue[] values, //
final int numStmts, //
final BigdataStatement[] stmts//
) {
this.database = database;
this.statementStore = statementStore;
this.readOnly = readOnly;
this.changeLog = changeLog;
this.didWriteCallback = didWriteCallback;
this.numValues = numValues;
this.values = values;
this.numStmts = numStmts;
this.stmts = stmts;
}
/**
*
* @param sb
* @param clone
* When true, the backing arrays are cloned in order to allow
* them to be cleared by the caller. When false, the caller
* MUST invoke {@link #writeNow()} synchronously. (This is used
* to make it easier to compare the two approaches without
* introducing any new overhead).
*/
Batch(final StatementBuffer sb, final boolean clone) {
if (sb == null)
throw new IllegalArgumentException();
/*
* All of these fields can be copied by reference. They are not
* carrying any interesting state information from the parser.
*/
this.database = sb.database;
this.statementStore = sb.statementStore;
this.readOnly = sb.readOnly;
this.changeLog = sb.changeLog;
this.didWriteCallback = sb.didWriteCallback;
if (!clone) {
// Copy array references.
this.numValues = sb.numValues;
this.values = sb.values;
this.numStmts = sb.numStmts;
this.stmts = sb.stmts;
} else {
// Clone array data.
this.numValues = sb.numValues;
this.values = new BigdataValue[sb.numValues];
System.arraycopy(sb.values/* src */, 0/* srcPos */, this.values/* dest */, 0/* destPos */,
sb.numValues/* length */);
this.numStmts = sb.numStmts;
this.stmts = new BigdataStatement[sb.numStmts];
System.arraycopy(sb.stmts/* src */, 0/* srcPos */, this.stmts/* dest */, 0/* destPos */,
sb.numStmts);
/*
* The data was cloned, so reset the state of the buffer in
* the outer context (but not the bnodes nor deferred stmts).
*/
sb._clear();
}
}
/**
* Flush the batch.
*
* @return A summary of the #of statements actually written and blank
* nodes actually resolved.
*/
private BatchResult writeNow() {
final long begin = System.currentTimeMillis();
long nBnodesResolved = 0;
if (log.isInfoEnabled())
log.info("numValues=" + numValues + ", numStmts=" + numStmts);
// Insert terms (batch operation).
if (numValues > 0) {
if (DEBUG) {
for (int i = 0; i < numValues; i++) {
log
.debug("adding term: "
+ values[i]
+ " (iv="
+ values[i].getIV()
+ ")"
+ ((values[i] instanceof BNode) ? "sid="
+ ((BigdataBNode) values[i]).isStatementIdentifier()
: ""));
}
}
// Count the #of bnodes which are still unresolved.
for (BigdataValue v: values) {
if (v instanceof BigdataBNode && v.getIV() == null) {
nBnodesResolved++;
}
}
addTerms(database, values, numValues, readOnly);
// Subtract the #of bnodes which remain unresolved;
// as a result we have the number of bnodes which were resolved.
for (BigdataValue v: values) {
if (v instanceof BigdataBNode && v.getIV() == null) {
nBnodesResolved--;
}
}
if (DEBUG) {
for (int i = 0; i < numValues; i++) {
log
.debug(" added term: "
+ values[i]
+ " (iv="
+ values[i].getIV()
+ ")"
+ ((values[i] instanceof BNode) ? "sid="
+ ((BigdataBNode) values[i]).isStatementIdentifier()
: ""));
}
}
}
// Insert statements (batch operation).
final long nwritten;
if (numStmts > 0) {
if (DEBUG) {
for (int i = 0; i < numStmts; i++) {
log.debug("adding stmt: " + stmts[i]);
}
}
nwritten = addStatements(database, statementStore, stmts, numStmts, changeLog, didWriteCallback);
if (DEBUG) {
for (int i = 0; i < numStmts; i++) {
log.debug(" added stmt: " + stmts[i]);
}
}
} else nwritten = 0;
if (log.isInfoEnabled()) {
final long elapsed = System.currentTimeMillis() - begin;
log.info("numValues=" + numValues + ", numStmts=" + numStmts
+ ", elapsed=" + elapsed + "ms");
}
return new BatchResult(nwritten, nBnodesResolved);
}
static private void addTerms(
final AbstractTripleStore database, //
final BigdataValue[] terms, //
final int numTerms,//
final boolean readOnly//
) {
if(log.isInfoEnabled())
log.info("writing " + numTerms);
if (DEBUG) {
for (int i = 0; i < numTerms; i++) {
log.debug("term: " + terms[i] + ", iv: " + terms[i].getIV());
}
}
final long l =
database.getLexiconRelation().addTerms(terms, numTerms, readOnly);
if (log.isInfoEnabled()) {
log.info("# reported from addTerms: " + l);
}
}
/**
* Adds the statements to each index (batch api, NO truth maintenance).
*
* Pre-conditions: The {s,p,o} term identifiers for each
* {@link BigdataStatement} are defined.
*
* Note: If statement identifiers are enabled and the context position is
* non-<code>null</code> then it will be unified with the statement
* identifier assigned to that statement. It is an error if the context
* position is a URI (since it can not be unified with the assigned
* statement identifier). It is an error if the context position is a blank
* node which is already bound to a term identifier whose value is different
* from the statement identifier assigned/reported by the {@link #database}.
*
* @param database
* The database that will be used to resolve terms. When
* statementStore is <code>null</code>, statements will be
* written into this store as well.
* @param statementStore
* When non-<code>null</code> the statements will be written on
* this store. When <code>null</code> the statements are written
* onto the database. (This is used to support incremental
* truth maintenance.)
* @param stmts
* An array of statements in any order.
* @param numStmts The number of statements in that array.
* @param changeLog
* The optional change log listener.
*
* @return The #of statements written on the database.
*/
final private static long addStatements(final AbstractTripleStore database,
final AbstractTripleStore statementStore, final BigdataStatement[] stmts, final int numStmts,
final IChangeLog changeLog,
final IWrittenSPOArray didWriteCallback) {
final SPO[] tmp = new SPO[numStmts];
for (int i = 0; i < tmp.length; i++) {
final BigdataStatement stmt = stmts[i];
final SPO spo = new SPO(stmt);
if (DEBUG)
log.debug("adding: " + stmt.toString() + " (" + spo + ")");
if(!spo.isFullyBound()) {
throw new AssertionError("Not fully bound? : " + spo);
}
tmp[i] = spo;
}
/*
* Note: When handling statement identifiers, we clone tmp[] to avoid a
* side-effect on its order so that we can unify the assigned statement
* identifiers below.
*
* Note: In order to report back the [ISPO#isModified()] flag, we also
* need to clone tmp[] to avoid a side effect on its order. Therefore we
* now always clone tmp[].
*/
// final long nwritten = writeSPOs(sids ? tmp.clone() : tmp, numStmts);
final long nwritten = writeSPOs(database, statementStore, tmp.clone(), numStmts, didWriteCallback);
// if (sids) {
//
// /*
// * Unify each assigned statement identifier with the context
// * position on the corresponding statement.
// */
//
// for (int i = 0; i < numStmts; i++) {
//
// final SPO spo = tmp[i];
//
// final BigdataStatement stmt = stmts[i];
//
// // verify that the BigdataStatement and SPO are the same triple.
// assert stmt.s() == spo.s;
// assert stmt.p() == spo.p;
// assert stmt.o() == spo.o;
//
// final BigdataResource c = stmt.getContext();
//
// if (c == null)
// continue;
//
//// if (c instanceof URI) {
////
//// throw new UnificationException(
//// "URI not permitted in context position when statement identifiers are enabled: "
//// + stmt);
////
//// }
//
// if( c instanceof BNode) {
//
// final IV sid = spo.getStatementIdentifier();
//
// if(c.getIV() != null) {
//
// if (!sid.equals(c.getIV())) {
//
// throw new UnificationException(
// "Can not unify blankNode "
// + c
// + "("
// + c.getIV()
// + ")"
// + " in context position with statement identifier="
// + sid + ": " + stmt + " (" + spo
// + ")");
//
// }
//
// } else {
//
// // assign the statement identifier.
// c.setIV(sid);
//
// if (log.isDebugEnabled()) {
//
// log.debug("Assigned statement identifier: " + c
// + "=" + sid);
//
// }
//
// }
//
// }
//
// }
//
// }
// Copy the state of the isModified() flag
for (int i = 0; i < numStmts; i++) {
if (tmp[i].isModified()) {
stmts[i].setModified(tmp[i].getModified());
if (changeLog != null) {
switch(stmts[i].getModified()) {
case INSERTED:
changeLog.changeEvent(new ChangeRecord(stmts[i], ChangeAction.INSERTED));
break;
case UPDATED:
changeLog.changeEvent(new ChangeRecord(stmts[i], ChangeAction.UPDATED));
break;
case REMOVED:
throw new AssertionError();
default:
break;
}
}
}
}
return nwritten;
}
/**
* Adds the statements to each index (batch api, NO truth maintenance).
*
* @param database
* The database that will be used to resolve terms. When
* statementStore is <code>null</code>, statements will be
* written into this store as well.
* @param statementStore
* When non-<code>null</code> the statements will be written on
* this store. When <code>null</code> the statements are written
* onto the database. (This is used to support incremental
* truth maintenance.)
* @param stmts
* An array of the statements to be written onto the backing
* store.
* @param numStmts
* The number of entries in that array to be written.
*
* @return The #of statements written on the database.
*
* @see AbstractTripleStore#addStatements(AbstractTripleStore, boolean,
* IChunkedOrderedIterator, IElementFilter)
*/
static private long writeSPOs(final AbstractTripleStore database, final AbstractTripleStore statementStore,
final SPO[] stmts, final int numStmts, final IWrittenSPOArray callback) {
final IChunkedOrderedIterator<ISPO> itr = new ChunkedArrayIterator<ISPO>(
numStmts, stmts, null/* keyOrder */);
final AbstractTripleStore sink = statementStore != null ? statementStore
: database;
if (log.isInfoEnabled()) {
log.info("writing " + numStmts + " on "
+ (statementStore != null ? "statementStore" : "database"));
if(DEBUG) {
for (int i = 0; i < numStmts; i++) {
log.debug("spo: " + stmts[i]);
}
}
}
// synchronous write on the target.
final long nwritten = database
.addStatements(sink, false/* copyOnly */, itr, null /* filter */);
if (callback != null) {
callback.didWriteSPOs(stmts, numStmts);
}
return nwritten;
}
} // class Batch
/**
* Add an "explicit" statement to the buffer (flushes on overflow, no
* context).
*
* @param s
* @param p
* @param o
*/
@Override
public void add(final Resource s, final URI p, final Value o) {
add(s, p, o, null, StatementEnum.Explicit);
}
/**
* Add an "explicit" statement to the buffer (flushes on overflow).
*
* @param s
* @param p
* @param o
* @param c
*/
@Override
public void add(final Resource s, final URI p, final Value o, final Resource c) {
add(s, p, o, c, StatementEnum.Explicit);
}
/**
* Add a statement to the buffer (core impl, flushes on overflow).
*
* @param s
* @param p
* @param o
* @param type
*/
@Override
public void add(final Resource s, final URI p, final Value o,
final Resource c, final StatementEnum type) {
if (nearCapacity()) {
// bulk insert the buffered data into the store.
if (true) {
// THIS IS THE CORRECT ACTION!
incrementalWrite();
} else {
/*
* This will flush all blank nodes. It may be necessary on very
* large files. It also resets the blank node and deferred
* statement maps afterwards (since they are set to null by
* reset()).
*/
flush();
bnodes = new HashMap<String, BigdataBNode>(bufferCapacity);
deferredStmts = new HashSet<BigdataStatement>(stmts.length);
}
}
// add to the buffer.
handleStatement(s, p, o, c, type);
}
@Override
public void add(final Statement e) {
add(e.getSubject(), e.getPredicate(), e.getObject(), e.getContext(),
(e instanceof BigdataStatement ? ((BigdataStatement) e)
.getStatementType() : null));
}
/**
* Returns true if the values array has fewer than arity slots remaining
* (taking unresolved blank nodes into account) or if there are no slots
* remaining in the statements array. Under those conditions adding
* another statement to the buffer could cause an overflow.
*
* @return True if the buffer might overflow if another statement were
* added.
*/
public boolean nearCapacity() {
if (numStmts + 1 > bufferCapacity)
return true;
// This check takes into account dynamically calculated #of unresolved bnodes,
// which will get added to values array while running incrementalWrite
// @see https://jira.blazegraph.com/browse/BLZG-1708
if (numValues + bnodesUnresolvedCount - bnodesResolvedCount + arity > values.length)
return true;
return false;
}
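/*
 * Worked example (illustrative): with capacity=1000 and arity=3 the
 * values[] array has 1000*3+5 = 3005 slots. If numValues=3000,
 * bnodesUnresolvedCount=4 and bnodesResolvedCount=0, then
 * 3000 + 4 - 0 + 3 = 3007 > 3005, so nearCapacity() reports true and the
 * buffer is flushed before incrementalWrite() could overflow values[]
 * when it appends the unresolved bnodes.
 */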
/**
* Canonicalizing mapping for a term.
*
* Note: Blank nodes are made canonical with the scope of the source from
* which the data are being read. See {@link #bnodes}. All other kinds of
* terms are made canonical within the scope of the buffer's current
* contents in order to keep down the demand on the heap when reading either
* very large documents or a series of small documents.
*
* @param term
* A term.
*
* @return Either the term or the pre-existing term in the buffer with the
* same data.
*/
private BigdataValue getDistinctTerm(final BigdataValue term, final boolean addIfAbsent) {
if (term == null)
return null;
if (term instanceof BNode) {
/*
* Canonicalizing map for blank nodes.
*
* Note: This map MUST stay in effect while reading from a given
* source and MUST be cleared (or set to null) before reading from
* another source.
*/
final BigdataBNode bnode = (BigdataBNode)term;
final BigdataStatement stmt = bnode.getStatement();
if (stmt != null) {
bnode.setStatement(valueFactory.createStatement(
(BigdataResource) getDistinctTerm(stmt.getSubject(), true),
(BigdataURI) getDistinctTerm(stmt.getPredicate(), true),
(BigdataValue) getDistinctTerm(stmt.getObject(), true)
));
/*
* Do not "add if absent". This is not a real term, just a
* composition of other terms.
*/
return bnode;
} else {
// the BNode's ID.
final String id = bnode.getID();
if (bnodes == null) {
/*
* Allocating canonicalizing map for blank nodes. Note:
* Using linked hash map since we have to iterate over this
* in order to decide how many resolved and unresolved blank
* nodes remain in the map per
* https://jira.blazegraph.com/browse/BLZG-1708 (DataLoader
* fails with ArrayIndexOutOfBoundsException).
*/
bnodes = new LinkedHashMap<String, BigdataBNode>(bufferCapacity);
bnodesUnresolvedCount = 0;
bnodesResolvedCount = 0;
// insert this blank node into the map.
bnodes.put(id, bnode);
} else {
// test canonicalizing map for blank nodes.
final BigdataBNode existingBNode = bnodes.get(id);
if (existingBNode != null) {
/*
* Return existing blank node with same ID, do not
* add since not absent.
*/
return existingBNode;
}
// insert this blank node into the map.
bnodes.put(id, bnode);
}
// keep track on #of unresolved IVs, which require
// adding to values array on running incrementalWrite
// @see https://jira.blazegraph.com/browse/BLZG-1708
// (DataLoader fails with ArrayIndexOutOfBoundsException)
if (bnode.getIV() == null) {
bnodesUnresolvedCount++;
}
}
// return term;
} else {
/*
* Other kinds of terms use a map whose scope is limited to the terms
* that are currently in the buffer. This keeps down the heap demand
* when reading very large documents.
*/
// TODO BLZG-1532 (JAVA8) replace with putIfAbsent()
final BigdataValue existingTerm = distinctTermMap.get(term);
if (existingTerm != null) {
// return the pre-existing term.
if(DEBUG) {
log.debug("duplicate: "+term);
}
if (equals(existingTerm, RDF_SUBJECT, RDF_PREDICATE, RDF_OBJECT, RDF_TYPE, RDF_STATEMENT)) {
if (addIfAbsent) {
addTerm(term);
}
}
/*
* Term already exists, do not add.
*/
return existingTerm;
}
if(DEBUG) {
log.debug("new term: "+term);
}
// put the new term in the map.
if (distinctTermMap.put(term, term) != null) {
throw new AssertionError();
}
}
if (addIfAbsent) {
addTerm(term);
}
// return the new term.
return term;
}
private void addTerm(final BigdataValue term) {
if (term == null)
return;
if (term instanceof URI) {
numURIs++;
values[numValues++] = term;
} else if (term instanceof BNode) {
/*
* Handle bnodes separately, in incrementalWrite().
*/
// if (!statementIdentifiers) {
//
// numBNodes++;
//
// values[numValues++] = term;
//
// }
} else {
numLiterals++;
values[numValues++] = term;
}
}
/**
* Adds the values and the statement into the buffer.
*
* @param _s
* The subject.
* @param _p
* The predicate.
* @param _o
* The object.
* @param _c
* The context (may be null).
* @param type
* The statement type.
*
* @throws IndexOutOfBoundsException
* if the buffer capacity is exceeded.
*
* @see #nearCapacity()
*/
protected void handleStatement(Resource _s, URI _p, Value _o, Resource _c,
final StatementEnum type) {
// silently strip context unless in quads mode. See #1086.
_c = database.isQuads() ? _c : null;
if (DEBUG) {
log.debug("handle stmt: " + _s + ", " + _p + ", " + _o + ", " + _c);
}
// if (arity == 3) c = null;
final BigdataResource s = (BigdataResource)
getDistinctTerm(valueFactory.asValue(_s), true);
final BigdataURI p = (BigdataURI)
getDistinctTerm(valueFactory.asValue(_p), true);
final BigdataValue o =
getDistinctTerm(valueFactory.asValue(_o), true);
final BigdataResource c = (BigdataResource)
getDistinctTerm(valueFactory.asValue(_c), true);
/*
* Form the BigdataStatement object now that we have the bindings.
*/
final BigdataStatement stmt = valueFactory.createStatement(s, p, o, c, type);
/*
* Specifically looking for reification syntax:
* _:sid rdf:type rdf:Statement .
* _:sid rdf:subject <S> .
* _:sid rdf:predicate <P> .
* _:sid rdf:object <O> .
*/
if (statementIdentifiers && s instanceof BNode) {
if (equals(p, RDF_SUBJECT, RDF_PREDICATE, RDF_OBJECT)) {
final BigdataBNodeImpl sid = (BigdataBNodeImpl) s;
if (sid.getStatement() != null) {
checkSid(sid, p, o);
log.warn("seeing a duplicate value for " + sid + ": " + p +"=" + o);
return;
}
if (reifiedStmts == null) {
reifiedStmts = new HashMap<BigdataBNodeImpl, ReifiedStmt>();
}
final ReifiedStmt reifiedStmt;
if (reifiedStmts.containsKey(sid)) {
reifiedStmt = reifiedStmts.get(sid);
} else {
reifiedStmt = new ReifiedStmt();
reifiedStmts.put(sid, reifiedStmt);
}
reifiedStmt.set(p, o);
if (DEBUG)
log.debug("reified piece: "+stmt);
if (reifiedStmt.isFullyBound(arity)) {
sid.setStatement(reifiedStmt.toStatement(valueFactory));
reifiedStmts.remove(sid);
}
return;
} else if (equals(o, RDF_STATEMENT) && equals(p, RDF_TYPE)) {
/*
* Ignore these statements.
*
* _:sid rdf:type rdf:Statement .
*/
return;
}
}
// add to the buffer.
stmts[numStmts++] = stmt;
numTotalStmts++;
final Future<Void> f = ft;
if (f != null && f.isDone()) {
/*
* We are transferring batches to a queue, but the task draining the
* queue is no longer running.
*/
try {
f.get(); // get the future. Expect ExecutionException.
// Nothing thrown, but task draining queue is not running so it
// is still a problem.
throw new RuntimeException("Writer is done?");
} catch (InterruptedException e) {
// Propagate interrupt
Thread.currentThread().interrupt();
} catch (ExecutionException ex) {
throw new RuntimeException(ex);
}
}
// if (c != null && statementIdentifiers && c instanceof BNode) {
//
// ((BigdataBNodeImpl) c).setStatement(stmt);
//
// }
}
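/*
 * For example (illustrative): the reification pattern recognized above
 * arrives from the parser as four statements,
 *
 *   _:s1 rdf:type      rdf:Statement .
 *   _:s1 rdf:subject   ex:a .
 *   _:s1 rdf:predicate ex:p .
 *   _:s1 rdf:object    ex:b .
 *
 * The rdf:type statement is ignored; the other three accumulate in a
 * ReifiedStmt until it is fully bound, at which point the sid _:s1 is
 * bound to the ground statement (ex:a, ex:p, ex:b).
 */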
private void checkSid(final BigdataBNode sid, final URI p, final Value o) {
final BigdataStatement stmt = sid.getStatement();
if ((p == RDF_SUBJECT && stmt.getSubject() != o) ||
(p == RDF_PREDICATE && stmt.getPredicate() != o) ||
(p == RDF_OBJECT && stmt.getObject() != o)) {
throw new UnificationException("sid cannot refer to multiple statements");
}
}
private boolean equals(final BigdataValue v1, final BigdataValue... v2) {
if (v2.length == 1) {
return _equals(v1, v2[0]);
} else {
for (BigdataValue v : v2) {
if (_equals(v1, v))
return true;
}
return false;
}
}
private boolean _equals(final BigdataValue v1, final BigdataValue v2) {
return v1 == v2;
// if (distinct) {
//
// return v1 == v2;
//
// } else {
//
// return v1.equals(v2);
//
// }
}
private static class ReifiedStmt implements Statement {
/**
*
*/
private static final long serialVersionUID = -7706421769807306702L;
private BigdataResource s;
private BigdataURI p;
private BigdataValue o;
private BigdataResource c;
public ReifiedStmt() {
}
public boolean isFullyBound(final int arity) {
return s != null && p != null && o != null && (arity > 3 ? c != null : true);
}
@Override
public BigdataResource getContext() {
return c;
}
@Override
public BigdataValue getObject() {
return o;
}
@Override
public BigdataURI getPredicate() {
return p;
}
@Override
public BigdataResource getSubject() {
return s;
}
public void set(final URI p, final BigdataValue o) {
if (p.toString().equals(RDF.SUBJECT.toString())) {
setSubject((BigdataResource) o);
} else if (p.toString().equals(RDF.PREDICATE.toString())) {
setPredicate((BigdataURI) o);
} else if (p.toString().equals(RDF.OBJECT.toString())) {
setObject(o);
// } else if (p.equals(RDF.CONTEXT)) {
//
// setPredicate((URI) c);
//
} else {
throw new IllegalArgumentException();
}
}
public void setSubject(final BigdataResource s) {
this.s = s;
}
public void setPredicate(final BigdataURI p) {
this.p = p;
}
public void setObject(final BigdataValue o) {
this.o = o;
}
// public void setContext(final BigdataResource c) {
// this.c = c;
// }
@Override
public String toString() {
return "<" + s + ", " + p + ", " + o + ", " + c + ">";
}
public BigdataStatement toStatement(final BigdataValueFactory vf) {
return vf.createStatement(s, p, o, c);
}
}
}