package com.bigdata.rdf.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Properties;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IJournal;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Journal;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.lexicon.BlobsIndexHelper;
import com.bigdata.rdf.lexicon.LexiconKeyOrder;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.rdf.sail.BigdataSail;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.service.IBigdataClient;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
/**
 * Utility class to dump the lexicon indices (TERM2ID, ID2TERM, and BLOBS) of a
 * triple store.
*
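 * <p>
 * Typical command-line invocation (a sketch; the namespace and the property
 * file path are illustrative values for the arguments documented on
 * {@link #main(String[])}):
 *
 * <pre>
 * java com.bigdata.rdf.util.DumpLexicon -tuples kb.lex /var/data/bigdata.properties
 * </pre>
 *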
* @author thompsonbry
*/
public class DumpLexicon {
    private final static String REMOTE_ERR_MSG = "Remote Lexicon dumping is not supported by this class."
            + "\nPlease use DumpRemoteLexicon in bigdata-jini."
            + "\nSee BLZG-1370.\n";
private final static String CONFIG_EXT = ".config";
private final static String PROPERTY_EXT = ".properties";
protected DumpLexicon() {
}
protected static void usage() {
        System.err.println("usage: (-tuples) <namespace> <filename>");
}
/**
* Open the {@link IIndexManager} identified by the property file.
*
* @param propertyFile
* The property file (for a standalone bigdata instance) or the
* jini configuration file (for a bigdata federation). The file
* must end with either ".properties" or ".config".
*
* Starting with 1.5.2 the remote dump lexicon capability
* was moved into the bigdata-jini artifact. See BLZG-1370.
*
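 *            For a standalone instance, a minimal properties file can be as
 *            small as a single line naming the backing store (a sketch; the
 *            key shown is the value of {@link BigdataSail.Options#FILE} and
 *            the path is illustrative):
 *
 *            <pre>
 *            com.bigdata.journal.AbstractJournal.file=/var/data/bigdata.jnl
 *            </pre>
 *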
* @return The {@link IIndexManager}.
*/
protected static IIndexManager openIndexManager(final String propertyFile) {
final File file = new File(propertyFile);
if (!file.exists()) {
throw new RuntimeException("Could not find file: " + file);
}
if (propertyFile.endsWith(CONFIG_EXT)) {
// scale-out.
throw new RuntimeException(REMOTE_ERR_MSG);
} else if (propertyFile.endsWith(PROPERTY_EXT)) {
// local journal.
} else {
/*
* Note: This is a hack, but we are recognizing the jini
* configuration file with a .config extension and the journal
* properties file with a .properties extension.
*/
throw new RuntimeException(
"File must have '.config' or '.properties' extension: "
+ file);
}
final IIndexManager indexManager;
try {
/*
* Note: we only need to specify the FILE when re-opening a journal
* containing a pre-existing KB.
*/
final Properties properties = new Properties();
{
// Read the properties from the file.
final InputStream is = new BufferedInputStream(
new FileInputStream(propertyFile));
try {
properties.load(is);
} finally {
is.close();
}
if (System.getProperty(BigdataSail.Options.FILE) != null) {
// Override/set from the environment.
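                // e.g., launch with -D<value of BigdataSail.Options.FILE>=/path/to/journal.jnl
                // to point the journal at a different backing file.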
properties.setProperty(BigdataSail.Options.FILE,
System.getProperty(BigdataSail.Options.FILE));
}
}
final Journal jnl = new Journal(properties);
indexManager = jnl;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
return indexManager;
}
/**
* @param args
* (-tuples) <namespace> <filename>
* where namespace is the namespace of the
     *            {@link LexiconRelation}. Use <code>kb.lex</code> if you have
* not overridden the namespace of the
* {@link AbstractTripleStore}.
* where filename is the name of the properties or
* configuration file to be used.
*/
public static void main(final String[] args) {
if (args.length < 2) {
usage();
System.exit(1);
}
boolean showTuples = false;
int i = 0;
        for (; i < args.length && args[i].startsWith("-"); i++) {
            final String arg = args[i];
            if (arg.equals("-tuples")) {
                showTuples = true;
            } else {
                System.err.println("Unknown option: " + arg);
                usage();
                System.exit(1);
            }
        }
        final String namespace = args[i++];
        final String propertyFile = args[i++];
        IIndexManager indexManager = null;
        try {
            indexManager = openIndexManager(propertyFile);
            if (indexManager instanceof IBigdataFederation<?>) {
                // Scale-out dumps are handled by bigdata-jini. See BLZG-1370.
                throw new RuntimeException(REMOTE_ERR_MSG);
            }
            // Read against the most recent commit point on the journal.
            final long timestamp = TimestampUtility.asHistoricalRead(indexManager
                    .getLastCommitTime());
            final AbstractTripleStore store = (AbstractTripleStore) indexManager
                    .getResourceLocator().locate(namespace, timestamp);
            if (store == null) {
                throw new RuntimeException("No such namespace: " + namespace);
            }
            final PrintWriter w = new PrintWriter(System.out);
            dump(store, w, showTuples);
            w.flush();
        } catch (Exception ex) {
            ex.printStackTrace(System.err);
        } finally {
            if (indexManager != null) {
                if (indexManager instanceof IJournal) {
                    // Local journal: shut it down.
                    ((IJournal) indexManager).shutdownNow();
                } else {
                    // Scale-out: disconnect the client from the federation.
                    IBigdataClient<?> client = null;
try {
                        client = ((IBigdataFederation<?>) indexManager)
                                .getClient();
} catch (IllegalStateException ex) {
// Ignore.
}
if (client != null)
client.disconnect(true/* immediateShutdown */);
}
}
}
}
// /**
// * Dumps the lexicon in a variety of ways (test suites only).
// */
// public StringBuilder dumpTerms() {
//
// final StringBuilder sb = new StringBuilder(Bytes.kilobyte32 * 4);
//
// /**
// * Dumps the terms in term order.
// */
// sb.append("---- terms in term order ----\n");
    // for( Iterator<Value> itr = termIterator(); itr.hasNext(); ) {
//
// final Value val = itr.next();
//
// if (val == null) {
// sb.append("NullIV");
// } else {
// sb.append(val.toString());
// }
//
// sb.append("\n");
//
// }
//
// return sb;
//
// }
    /**
     * Dumps the lexicon in a variety of ways.
     *
     * @param store
     *            The triple store whose lexicon will be dumped.
     * @param w
     *            Where to write the dump.
     * @param showBlobs
     *            When <code>true</code>, the individual entries in the BLOBS
     *            index are also reported.
     */
static public void dump(final AbstractTripleStore store, final Writer w,
final boolean showBlobs) {
// /*
// * Note: it is no longer true that all terms are stored in the reverse
// * index (BNodes are not). Also, statement identifiers are stored in the
// * forward index, so we can't really write the following assertion
// * anymore.
// */
// // Same #of terms in the forward and reverse indices.
// assertEquals("#terms", store.getIdTermIndex().rangeCount(null, null),
// store.getTermIdIndex().rangeCount(null, null));
final LexiconRelation r = store.getLexiconRelation();
try {
/**
* Dumps the forward mapping (TERM2ID).
*/
{
w.write(r.getFQN(LexiconKeyOrder.TERM2ID)
+ " (forward mapping)\n");
final IIndex ndx = store.getLexiconRelation().getTerm2IdIndex();
                final ITupleIterator<?> itr = ndx.rangeIterator();
while (itr.hasNext()) {
                    final ITuple<?> tuple = itr.next();
/*
* The sort key for the term. This is not readily decodable.
* See LexiconKeyBuilder for specifics.
*/
final byte[] key = tuple.getKey();
/*
* Decode the TermIV.
*/
                    final TermId<?> iv = (TermId<?>) IVUtility.decode(tuple
                            .getValue());
w.write(BytesUtil.toString(key) + ":" + iv + "\n");
}
}
/**
* Dumps the reverse mapping.
*/
{
w.write(r.getFQN(LexiconKeyOrder.ID2TERM)
+ " (reverse mapping)\n");
final IIndex ndx = store.getLexiconRelation().getId2TermIndex();
@SuppressWarnings("unchecked")
                final ITupleIterator<BigdataValue> itr = ndx.rangeIterator();
while (itr.hasNext()) {
                    final ITuple<BigdataValue> tuple = itr.next();
final BigdataValue term = tuple.getObject();
w.write(term.getIV() + ":" + term + " (iv=" + term.getIV()
+ ")\n");
}
}
// /**
// * Dumps the term:id index.
// */
// for( Iterator itr =
// store.getLexiconRelation().termsIndexScan(); itr.hasNext(); ) {
//
// System.err.println("term->id : "+itr.next());
//
// }
//
// /**
// * Dumps the id:term index.
// */
// for( Iterator itr =
// store.getLexiconRelation().idTermIndexScan(); itr.hasNext(); ) {
//
// System.err.println("id->term : "+itr.next());
//
// }
//
// /**
// * Dumps the terms in term order.
// */
// for( Iterator itr =
// store.getLexiconRelation().termIterator(); itr.hasNext(); ) {
//
// System.err.println("termOrder : "+itr.next());
//
// }
/*
* Dump the BLOBs index.
*/
w.write(r.getFQN(LexiconKeyOrder.BLOBS) + " (large values)\n");
dumpBlobs(w, showBlobs/* showEntries */, r.getNamespace(), r
.getBlobsIndex());
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
/**
* Dump the lexicon.
*
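     * A minimal usage sketch (assumes an {@link AbstractTripleStore} opened
     * elsewhere):
     *
     * <pre>
     * final AbstractTripleStore store = ...; // obtained from the index manager
     * System.out.println(DumpLexicon.dump(store.getLexiconRelation()));
     * </pre>
     *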
* @param r
* The lexicon relation.
*
* @return The dump.
*/
static public Appendable dump(final LexiconRelation r) {
final StringWriter w = new StringWriter(//
100 * Bytes.kilobyte32// initialCapacity
);
w.append(r.getLexiconConfiguration().toString());
w.append("\n");
dump(r.getContainer(), w, true/*showEntries*/);
return w.getBuffer();
}
    /**
     * Dump the BLOBS index.
     *
     * @param namespace
     *            The namespace of the {@link LexiconRelation}.
     * @param ndx
     *            The BLOBS index for that {@link LexiconRelation}.
     *
     * @return The dump.
     */
static public Appendable dumpBlobs(final String namespace, final IIndex ndx) {
final StringWriter w = new StringWriter(//
100 * Bytes.kilobyte32// initialCapacity
);
DumpLexicon.dumpBlobs(w, true/*showEntries*/, namespace, ndx);
return w.getBuffer();
}
/**
* Core implementation for dumping the BLOBS index.
*
* @param w
* Where to write the data.
* @param showEntries
     *            When <code>true</code>, the individual entries in the TERMS
     *            index will be reported. When <code>false</code>, only
     *            metadata about the scanned entries will be reported.
* @param namespace
* The namespace of the {@link LexiconRelation}.
* @param ndx
* The BLOBS index for that {@link LexiconRelation}.
*/
static public void dumpBlobs(final Writer w, final boolean showEntries,
final String namespace, final IIndex ndx) {
final int BIN_SIZE = 256;
final int NBINS = (BlobsIndexHelper.MAX_COUNTER + 1) / BIN_SIZE;
try {
int maxCollisionCounter = 0;
/*
* An array of bins reporting the #of TERMS having the #of collision
* counters for that bin. The bins are each BIN_SIZE wide. There are
* NBINS bins. For a given counter value, the bin is selected by
* floor(counter/binSize).
*
* TODO It would be much more useful to use a sparse array so we can
* report on the distribution at the lower end of the hash collision
* counter range, which is where most of the collisions will be
* found.
*/
final long[] bins = new long[NBINS];
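            // Example: with BIN_SIZE = 256, collision counters 0-255 land in
            // bins[0], and a counter of 600 lands in bins[600 / 256] = bins[2].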
final BigdataValueFactory vf = BigdataValueFactoryImpl
.getInstance(namespace);
            final BigdataValueSerializer<BigdataValue> valSer = vf
                    .getValueSerializer();
// Used to decode the Values.
final StringBuilder tmp = new StringBuilder();
w.append("fastRangeCount=" + ndx.rangeCount()+"\n");
@SuppressWarnings("unchecked")
            final ITupleIterator<?> itr = ndx.rangeIterator();
long nvisited = 0L;
while (itr.hasNext()) {
                final ITuple<?> tuple = itr.next();
nvisited++;
if (tuple.isNull()) {
if (showEntries) {
w.append("NullIV: key=");
w.append(BytesUtil.toString(tuple.getKey()));
w.append("\n");
}
} else {
                    final BlobIV<?> iv = (BlobIV<?>) IVUtility
                            .decodeFromOffset(tuple.getKeyBuffer().array(), 0/* offset */);
// new TermId(tuple.getKey());
final BigdataValue value = valSer.deserialize(tuple
.getValueStream(), tmp);
if (showEntries) {
w.append(iv.toString());
w.append(" => ");
w.append(value.toString());
w.append("\n");
}
final int counter = iv.counter();
if (counter > maxCollisionCounter) {
maxCollisionCounter = counter;
}
final int bin = (int) (counter / BIN_SIZE);
bins[bin]++;
}
}
w.append("nvisited=" + nvisited+"\n");
w.append("binSize=" + BIN_SIZE+"\n");
w.append("nbins=" + NBINS + "\n");
// #of non-zero bins.
int nnzero = 0;
for (int bin = 0; bin < NBINS; bin++) {
final long numberInBin = bins[bin];
if (numberInBin == 0)
continue;
w.append("bins[" + bin + "]=" + numberInBin + "\n");
nnzero++;
}
w.append("numNonZeroBins=" + nnzero + "\n");
w.append("maxCollisionCounter=" + maxCollisionCounter + "\n");
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}