package com.bigdata.rdf.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Properties;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IJournal;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Journal;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.lexicon.BlobsIndexHelper;
import com.bigdata.rdf.lexicon.LexiconKeyOrder;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.rdf.sail.BigdataSail;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.service.IBigdataClient;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
/**
 * Utility class to dump the lexicon indices (TERM2ID, ID2TERM, and BLOBS) of a
 * triple store.
*
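 * <p>
 * Typical command-line invocation (a sketch; the namespace and the property
 * file path are illustrative values for the arguments documented on
 * {@link #main(String[])}):
 *
 * <pre>
 * java com.bigdata.rdf.util.DumpLexicon -tuples kb.lex /var/data/bigdata.properties
 * </pre>
 *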
* @author thompsonbry
*/
public class DumpLexicon {
    private final static String REMOTE_ERR_MSG = "Remote Lexicon dumping is not supported by this class."
            + "\nPlease use DumpRemoteLexicon in bigdata-jini."
            + "\nSee BLZG-1370.\n";
private final static String CONFIG_EXT = ".config";
private final static String PROPERTY_EXT = ".properties";
protected DumpLexicon() {
}
protected static void usage() {
        System.err.println("usage: (-tuples) <namespace> <filename>");
}
/**
* Open the {@link IIndexManager} identified by the property file.
*
* @param propertyFile
* The property file (for a standalone bigdata instance) or the
* jini configuration file (for a bigdata federation). The file
* must end with either ".properties" or ".config".
*
* Starting with 1.5.2 the remote dump lexicon capability
* was moved into the bigdata-jini artifact. See BLZG-1370.
*
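 *            For a standalone instance, a minimal properties file can be as
 *            small as a single line naming the backing store (a sketch; the
 *            key shown is the value of {@link BigdataSail.Options#FILE} and
 *            the path is illustrative):
 *
 *            <pre>
 *            com.bigdata.journal.AbstractJournal.file=/var/data/bigdata.jnl
 *            </pre>
 *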
* @return The {@link IIndexManager}.
*/
protected static IIndexManager openIndexManager(final String propertyFile) {
final File file = new File(propertyFile);
if (!file.exists()) {
throw new RuntimeException("Could not find file: " + file);
}
if (propertyFile.endsWith(CONFIG_EXT)) {
// scale-out.
throw new RuntimeException(REMOTE_ERR_MSG);
} else if (propertyFile.endsWith(PROPERTY_EXT)) {
// local journal.
} else {
/*
* Note: This is a hack, but we are recognizing the jini
* configuration file with a .config extension and the journal
* properties file with a .properties extension.
*/
throw new RuntimeException(
"File must have '.config' or '.properties' extension: "
+ file);
}
final IIndexManager indexManager;
try {
/*
* Note: we only need to specify the FILE when re-opening a journal
* containing a pre-existing KB.
*/
final Properties properties = new Properties();
{
// Read the properties from the file.
final InputStream is = new BufferedInputStream(
new FileInputStream(propertyFile));
try {
properties.load(is);
} finally {
is.close();
}
if (System.getProperty(BigdataSail.Options.FILE) != null) {
// Override/set from the environment.
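                // e.g., launch with -D<value of BigdataSail.Options.FILE>=/path/to/journal.jnl
                // to point the journal at a different backing file.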
properties.setProperty(BigdataSail.Options.FILE,
System.getProperty(BigdataSail.Options.FILE));
}
}
final Journal jnl = new Journal(properties);
indexManager = jnl;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
return indexManager;
}
/**
* @param args
* (-tuples) <namespace> <filename>
* where namespace is the namespace of the
     *            {@link LexiconRelation}. Use <code>kb.lex</code> if you have
* not overridden the namespace of the
* {@link AbstractTripleStore}.
* where filename is the name of the properties or
* configuration file to be used.
*/
public static void main(final String[] args) {
if (args.length < 2) {
usage();
System.exit(1);
}
boolean showTuples = false;
int i = 0;
        for (; i < args.length && args[i].startsWith("-"); i++) {
            final String arg = args[i];
            if (arg.equals("-tuples")) {
                showTuples = true;
            } else {
                System.err.println("Unknown option: " + arg);
                usage();
                System.exit(1);
            }
        }
        final String namespace = args[i++];
        final String propertyFile = args[i++];
        IIndexManager indexManager = null;
        try {
            indexManager = openIndexManager(propertyFile);
            if (indexManager instanceof IBigdataFederation<?>) {
                // Scale-out dumps are handled by bigdata-jini. See BLZG-1370.
                throw new RuntimeException(REMOTE_ERR_MSG);
            }
            // Read against the most recent commit point on the journal.
            final long timestamp = TimestampUtility.asHistoricalRead(indexManager
                    .getLastCommitTime());
            final AbstractTripleStore store = (AbstractTripleStore) indexManager
                    .getResourceLocator().locate(namespace, timestamp);
            if (store == null) {
                throw new RuntimeException("No such namespace: " + namespace);
            }
            final PrintWriter w = new PrintWriter(System.out);
            dump(store, w, showTuples);
            w.flush();
        } catch (Exception ex) {
            ex.printStackTrace(System.err);
        } finally {
            if (indexManager != null) {
                if (indexManager instanceof IJournal) {
                    // Local journal: shut it down.
                    ((IJournal) indexManager).shutdownNow();
                } else {
                    // Scale-out: disconnect the client from the federation.
                    IBigdataClient<?> client = null;
try {
                        client = ((IBigdataFederation<?>) indexManager)
                                .getClient();
} catch (IllegalStateException ex) {
// Ignore.
}
if (client != null)
client.disconnect(true/* immediateShutdown */);
}
}
}
}
// /**
// * Dumps the lexicon in a variety of ways (test suites only).
// */
// public StringBuilder dumpTerms() {
//
// final StringBuilder sb = new StringBuilder(Bytes.kilobyte32 * 4);
//
// /**
// * Dumps the terms in term order.
// */
// sb.append("---- terms in term order ----\n");
    // for( Iterator<Value> itr = termIterator(); itr.hasNext(); ) {
//
// final Value val = itr.next();
//
// if (val == null) {
// sb.append("NullIV");
// } else {
// sb.append(val.toString());
// }
//
// sb.append("\n");
//
// }
//
// return sb;
//
// }
    /**
     * Dumps the lexicon in a variety of ways.
     *
     * @param store
     *            The triple store whose lexicon will be dumped.
     * @param w
     *            Where to write the dump.
     * @param showBlobs
     *            When <code>true</code>, the individual entries in the BLOBS
     *            index are also reported.
     */
static public void dump(final AbstractTripleStore store, final Writer w,
final boolean showBlobs) {
// /*
// * Note: it is no longer true that all terms are stored in the reverse
// * index (BNodes are not). Also, statement identifiers are stored in the
// * forward index, so we can't really write the following assertion
// * anymore.
// */
// // Same #of terms in the forward and reverse indices.
// assertEquals("#terms", store.getIdTermIndex().rangeCount(null, null),
// store.getTermIdIndex().rangeCount(null, null));
final LexiconRelation r = store.getLexiconRelation();
try {
/**
* Dumps the forward mapping (TERM2ID).
*/
{
w.write(r.getFQN(LexiconKeyOrder.TERM2ID)
+ " (forward mapping)\n");
final IIndex ndx = store.getLexiconRelation().getTerm2IdIndex();
                final ITupleIterator<?> itr = ndx.rangeIterator();
while (itr.hasNext()) {
                    final ITuple<?> tuple = itr.next();
/*
* The sort key for the term. This is not readily decodable.
* See LexiconKeyBuilder for specifics.
*/
final byte[] key = tuple.getKey();
/*
* Decode the TermIV.
*/
                    final TermId<?> iv = (TermId<?>) IVUtility.decode(tuple
                            .getValue());
w.write(BytesUtil.toString(key) + ":" + iv + "\n");
}
}
/**
* Dumps the reverse mapping.
*/
{
w.write(r.getFQN(LexiconKeyOrder.ID2TERM)
+ " (reverse mapping)\n");
final IIndex ndx = store.getLexiconRelation().getId2TermIndex();
@SuppressWarnings("unchecked")
                final ITupleIterator<BigdataValue> itr = ndx.rangeIterator();
while (itr.hasNext()) {
                    final ITuple<BigdataValue> tuple = itr.next();
final BigdataValue term = tuple.getObject();
w.write(term.getIV() + ":" + term + " (iv=" + term.getIV()
+ ")\n");
}
}
// /**
// * Dumps the term:id index.
// */
// for( Iterator itr =
// store.getLexiconRelation().termsIndexScan(); itr.hasNext(); ) {
//
// System.err.println("term->id : "+itr.next());
//
// }
//
// /**
// * Dumps the id:term index.
// */
// for( Iterator itr =
// store.getLexiconRelation().idTermIndexScan(); itr.hasNext(); ) {
//
// System.err.println("id->term : "+itr.next());
//
// }
//
// /**
// * Dumps the terms in term order.
// */
// for( Iterator itr =
// store.getLexiconRelation().termIterator(); itr.hasNext(); ) {
//
// System.err.println("termOrder : "+itr.next());
//
// }
/*
* Dump the BLOBs index.
*/
w.write(r.getFQN(LexiconKeyOrder.BLOBS) + " (large values)\n");
dumpBlobs(w, showBlobs/* showEntries */, r.getNamespace(), r
.getBlobsIndex());
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
/**
* Dump the lexicon.
*
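     * A minimal usage sketch (assumes an {@link AbstractTripleStore} opened
     * elsewhere):
     *
     * <pre>
     * final AbstractTripleStore store = ...; // obtained from the index manager
     * System.out.println(DumpLexicon.dump(store.getLexiconRelation()));
     * </pre>
     *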
* @param r
* The lexicon relation.
*
* @return The dump.
*/
static public Appendable dump(final LexiconRelation r) {
final StringWriter w = new StringWriter(//
100 * Bytes.kilobyte32// initialCapacity
);
w.append(r.getLexiconConfiguration().toString());
w.append("\n");
dump(r.getContainer(), w, true/*showEntries*/);
return w.getBuffer();
}
    /**
     * Dump the BLOBS index.
     *
     * @param namespace
     *            The namespace of the {@link LexiconRelation}.
     * @param ndx
     *            The BLOBS index for that {@link LexiconRelation}.
     *
     * @return The dump.
     */
static public Appendable dumpBlobs(final String namespace, final IIndex ndx) {
final StringWriter w = new StringWriter(//
100 * Bytes.kilobyte32// initialCapacity
);
DumpLexicon.dumpBlobs(w, true/*showEntries*/, namespace, ndx);
return w.getBuffer();
}
/**
* Core implementation for dumping the BLOBS index.
*
* @param w
* Where to write the data.
* @param showEntries
     *            When <code>true</code>, the individual entries in the TERMS
     *            index will be reported. When <code>false</code>, only
     *            metadata about the scanned entries will be reported.
* @param namespace
* The namespace of the {@link LexiconRelation}.
* @param ndx
* The BLOBS index for that {@link LexiconRelation}.
*/
static public void dumpBlobs(final Writer w, final boolean showEntries,
final String namespace, final IIndex ndx) {
final int BIN_SIZE = 256;
final int NBINS = (BlobsIndexHelper.MAX_COUNTER + 1) / BIN_SIZE;
try {
int maxCollisionCounter = 0;
/*
* An array of bins reporting the #of TERMS having the #of collision
* counters for that bin. The bins are each BIN_SIZE wide. There are
* NBINS bins. For a given counter value, the bin is selected by
* floor(counter/binSize).
*
* TODO It would be much more useful to use a sparse array so we can
* report on the distribution at the lower end of the hash collision
* counter range, which is where most of the collisions will be
* found.
*/
final long[] bins = new long[NBINS];
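            // Example: with BIN_SIZE = 256, collision counters 0-255 land in
            // bins[0], and a counter of 600 lands in bins[600 / 256] = bins[2].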
final BigdataValueFactory vf = BigdataValueFactoryImpl
.getInstance(namespace);
            final BigdataValueSerializer<BigdataValue> valSer = vf
                    .getValueSerializer();
// Used to decode the Values.
final StringBuilder tmp = new StringBuilder();
w.append("fastRangeCount=" + ndx.rangeCount()+"\n");
@SuppressWarnings("unchecked")
            final ITupleIterator<?> itr = ndx.rangeIterator();
long nvisited = 0L;
while (itr.hasNext()) {
                final ITuple<?> tuple = itr.next();
nvisited++;
if (tuple.isNull()) {
if (showEntries) {
w.append("NullIV: key=");
w.append(BytesUtil.toString(tuple.getKey()));
w.append("\n");
}
} else {
                    final BlobIV<?> iv = (BlobIV<?>) IVUtility
                            .decodeFromOffset(tuple.getKeyBuffer().array(), 0/* offset */);
// new TermId(tuple.getKey());
final BigdataValue value = valSer.deserialize(tuple
.getValueStream(), tmp);
if (showEntries) {
w.append(iv.toString());
w.append(" => ");
w.append(value.toString());
w.append("\n");
}
final int counter = iv.counter();
if (counter > maxCollisionCounter) {
maxCollisionCounter = counter;
}
final int bin = (int) (counter / BIN_SIZE);
bins[bin]++;
}
}
w.append("nvisited=" + nvisited+"\n");
w.append("binSize=" + BIN_SIZE+"\n");
w.append("nbins=" + NBINS + "\n");
// #of non-zero bins.
int nnzero = 0;
for (int bin = 0; bin < NBINS; bin++) {
final long numberInBin = bins[bin];
if (numberInBin == 0)
continue;
w.append("bins[" + bin + "]=" + numberInBin + "\n");
nnzero++;
}
w.append("numNonZeroBins=" + nnzero + "\n");
w.append("maxCollisionCounter=" + maxCollisionCounter + "\n");
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}