All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.rdf.graph.impl.util.GASRunnerBase Maven / Gradle / Ivy

package com.bigdata.rdf.graph.impl.util;

import java.lang.reflect.Constructor;
import java.util.LinkedHashSet;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.log4j.Logger;
import org.openrdf.model.Value;

import com.bigdata.rdf.graph.FrontierEnum;
import com.bigdata.rdf.graph.IGASContext;
import com.bigdata.rdf.graph.IGASEngine;
import com.bigdata.rdf.graph.IGASProgram;
import com.bigdata.rdf.graph.IGASScheduler;
import com.bigdata.rdf.graph.IGASSchedulerImpl;
import com.bigdata.rdf.graph.IGASState;
import com.bigdata.rdf.graph.IGASStats;
import com.bigdata.rdf.graph.IGraphAccessor;
import com.bigdata.rdf.graph.impl.GASEngine;
import com.bigdata.rdf.graph.impl.GASState;
import com.bigdata.rdf.graph.impl.GASStats;

/**
 * Base class for running performance tests.
 * 
 * @param 
 *            The generic type for the per-vertex state. This is scoped to the
 *            computation of the {@link IGASProgram}.
 * @param 
 *            The generic type for the per-edge state. This is scoped to the
 *            computation of the {@link IGASProgram}.
 * @param 
 *            The generic type for the SUM. This is often directly related to
 *            the generic type for the per-edge state, but that is not always
 *            true. The SUM type is scoped to the GATHER + SUM operation (NOT
 *            the computation).
 * 
 * @author Bryan Thompson
 * 
 *         TODO Do we need a different driver if the algorithm always visits all
 *         vertices? For such algorithms, we just run them once per graph
 *         (unless the graph is dynamic).
 */
public abstract class GASRunnerBase implements
        Callable {

    private static final Logger log = Logger.getLogger(GASRunnerBase.class);
    
    /**
     * Configured options for the {@link GASRunner}.
     * 
     * @author Bryan Thompson
     */
    protected class OptionData {
        static public final long DEFAULT_SEED = 217L;
        static public final int DEFAULT_NRUNS = 1;
        static public final int DEFAULT_NSAMPLES = 100;
        static public final int DEFAULT_NTHREADS = 4; // TODO #of hardware threads?
        /**
         * The seed used for the random number generator (default {@value #seed}
         * ).
         */
        public long seed = DEFAULT_SEED;
        /**
         * Random number generated used for sampling the starting vertices. Set
         * by #init().
         */
        public Random r = null;
        /**
         * The #of runs per initial condition. For algorithms that use a single
         * starting vertex (BFS, SSSP, etc.), there will be a total of
         * nruns * nsamples runs. For algorithms that populate the
         * initial frontier with all vertices (e.g., CC, PR), there will be a
         * total of nruns.
         */
        public int nruns = DEFAULT_NRUNS;
        /**
         * The #of random starting vertices to use for algorithms that use
         * a single starting vertex in the initial frontier and otherwise
         * ignored.
         */
        public int nsamples = DEFAULT_NSAMPLES;
        /**
         * The #of threads to use for GATHER and SCATTER operators.
         */
        public int nthreads = DEFAULT_NTHREADS;
        /**
         * The analytic class to be executed.
         */
        public Class> analyticClass;
        /**
         * The {@link IGASSchedulerImpl} class to use.
         * 
         * TODO Override or always? If always, then where to get the default?
         */
        public Class schedulerClassOverride;
        
        /** Set of files to load (may be empty). */
        public final LinkedHashSet loadSet = new LinkedHashSet();
        
        /** The name of the implementation specific configuration file. */
        public String propertyFile; 

        protected OptionData() {
            
        }

        /**
         * Initialize any resources, including the connection to the backend.
         */
        public void init() throws Exception {

            // Setup the random number generator.
            this.r = new Random(seed);

            r = new Random(seed);
            
        }
        
        /**
         * Shutdown any resources, including the connection to the backend.
         * 

* Note: This method must be safe. It may be called if {@link #init()} * fails. It may be called more than once. */ public void shutdown() { } /** * Return trueiff one or more arguments can be parsed * starting at the specified index. * * @param i * The index into the arguments. * @param args * The arguments. * * @return true iff any arguments were recognized. */ public boolean handleArg(final AtomicInteger i, final String[] args) { return false; } /** * Print the optional message on stderr, print the usage information on * stderr, and then force the program to exit with the given status code. * * @param status * The status code. * @param msg * The optional message */ public void usage(final int status, final String msg) { if (msg != null) { System.err.println(msg); } System.err.println("[options] analyticClass propertyFile"); System.exit(status); } /** * Extension hook for reporting at the end of the test run. * * @param sb A buffer into which more information may be appended. */ public void report(final StringBuilder sb) { // NOP } } // class OptionData /** * The configuration metadata for the run. */ private final OptionData opt; /** * Factory for the {@link OptionData}. */ abstract protected OptionData newOptionData(); /** * The {@link OptionData} for the run. */ protected OptionData getOptionData() { return opt; } /** * Factory for the {@link IGASEngine}. */ abstract protected IGASEngine newGASEngine(); /** * Load files into the backend if they can not be assumed to already exist * (a typical pattern is that files are loaded into an empty KB instance, * but not loaded into a pre-existing one). * * @throws Exception */ abstract protected void loadFiles() throws Exception; /** * Run a GAS analytic against some data set. * * @param args * USAGE:
* (options) analyticClass propertyFile *

* Where: *

*
propertyFile
*
The implementation specific property file or other type of * configuration file.
*
* and options are any of: *
*
-nthreads
*
The #of threads which will be used for GATHER and SCATTER * operations (default {@value OptionData#DEFAULT_NTHREADS}).
*
-nruns
*
The #of times that the algorithm will be run. For * algorithms that are initialized with a single starting vertex * drawn from a random sample, total number of runs is * nruns * nsamples. For algorithms that are * initialize with either all vertices, there will be a total of * nruns runs. (default {@value OptionData#DEFAULT_NRUNS})
*
-nsamples
*
For algorithms that use a single starting vertex (such as * BFS, SSSP, etc.), this is the #of starting vertices that will * be randomly selected. The sampled vertices will have at least * one out-edge or in-edge as appropriate based on the * {@link IGASProgram}. For algorithm will be run ONCE for EACH * sampled vertex. This parameter is ignored for algorithms that * initialize the frontier with all vertices (PR, CC, etc). * (default {@value OptionData#DEFAULT_NSAMPLES})
*
-seed
*
The seed for the random number generator (default * {@value OptionData#DEFAULT_SEED}).
*
-schedulerClass
*
Override the default {@link IGASScheduler}. Class must * implement {@link IGASSchedulerImpl}.
*
-load
*
Loads the named resource IFF the KB is empty (or does not * exist) at the time this utility is executed. This option may * appear multiple times. The resources will be searched for as * URLs, on the CLASSPATH, and in the file system.
*

* @throws ClassNotFoundException */ public GASRunnerBase(final String[] args) throws ClassNotFoundException { final OptionData opt = newOptionData(); /* * Handle all arguments starting with "-". These should appear before * any non-option arguments to the program. */ final AtomicInteger i = new AtomicInteger(0); while (i.get() < args.length) { final String arg = args[i.get()]; if (arg.startsWith("-")) { if (arg.equals("-seed")) { opt.seed = Long.valueOf(args[i.incrementAndGet()]); } else if (arg.equals("-nruns")) { final String s = args[i.incrementAndGet()]; opt.nruns = Integer.valueOf(s); if (opt.nruns <= 0) { opt.usage(1/* status */, "-nruns must be positive, not: " + s); } } else if (arg.equals("-nsamples")) { final String s = args[i.incrementAndGet()]; opt.nsamples = Integer.valueOf(s); if (opt.nsamples <= 0) { opt.usage(1/* status */, "-nsamples must be positive, not: " + s); } } else if (arg.equals("-nthreads")) { final String s = args[i.incrementAndGet()]; opt.nthreads = Integer.valueOf(s); if (opt.nthreads < 0) { opt.usage(1/* status */, "-nthreads must be non-negative, not: " + s); } } else if (arg.equals("-schedulerClass")) { final String s = args[i.incrementAndGet()]; opt.schedulerClassOverride = (Class) Class.forName(s); } else if (arg.equals("-load")) { final String s = args[i.incrementAndGet()]; opt.loadSet.add(s); } else { if (!opt.handleArg(i, args)) { opt.usage(1/* status */, "Unknown argument: " + arg); } } } else { break; } i.incrementAndGet(); } /* * Check for the remaining (required) argument(s). */ final int nremaining = args.length - i.get(); if (nremaining != 2) { /* * There are either too many or too few arguments remaining. */ opt.usage(1/* status */, nremaining < 1 ? "Too few arguments." : "Too many arguments"); } /* * The analytic to be executed. */ { final String s = args[i.getAndIncrement()]; opt.analyticClass = (Class>) Class .forName(s); } /* * Property file. */ opt.propertyFile = args[i.getAndIncrement()]; this.opt = opt; // assign options. } /** * Return the object used to access the as-configured graph. */ abstract protected IGraphAccessor newGraphAccessor(); /** * Return an instance of the {@link IGASProgram} to be evaluated. */ protected IGASProgram newGASProgram() { final Class> cls = (Class>)opt.analyticClass; try { final Constructor> ctor = cls .getConstructor(new Class[] {}); final IGASProgram gasProgram = ctor .newInstance(new Object[] {}); return gasProgram; } catch (Exception e) { throw new RuntimeException(e); } } /** * Run the test. *

* This provides a safe pattern for either loading data into a temporary * journal, which is then destroyed, or using an exiting journal and * optionally loading in some data set. When we load the data the journal is * destroyed afterwards and when the journal is pre-existing and we neither * load the data nor destroy the journal. This has to do with the effective * BufferMode (if transient) and whether the file is specified and whether a * temporary file is created (CREATE_TEMP_FILE). If we do our own file * create if the effective buffer mode is non-transient, then we can get all * this information. */ @Override final public IGASStats call() throws Exception { try { // initialize backend / connection to backend. opt.init(); // Load data sets loadFiles(); // Run GAS program. return runAnalytic(); } finally { // Shutdown backend / connection to backend. opt.shutdown(); } } /** * Run the analytic. * * @return The performance statistics for the run. * * @throws Exception */ final protected IGASStats runAnalytic() throws Exception { final IGASEngine gasEngine = newGASEngine(); try { if (opt.schedulerClassOverride != null) { ((GASEngine) gasEngine) .setSchedulerClass(opt.schedulerClassOverride); } final IGASProgram gasProgram = newGASProgram(); final IGraphAccessor graphAccessor = newGraphAccessor(); final IGASContext gasContext = gasEngine.newGASContext( graphAccessor, gasProgram); final IGASState gasState = gasContext.getGASState(); final FrontierEnum frontierEnum = gasProgram .getInitialFrontierEnum(); /* * TODO Should be customized if we do not want to use the default * behavior (sample is drawn from distribution containing all * vertices versus a subset of the vertices). */ final VertexDistribution dist = frontierEnum .equals(FrontierEnum.SingleVertex) ? graphAccessor .getDistribution(opt.r) : null; /* * FIXME We need to introduce an abstraction that will allow us to * indicate whether an analytic (a) runs one or more times with a * pre-populated frontier that is a single vertex choosen from a * distribution, (b) runs one or more times using a sample of * vertices; or (c) runs one or more times using all vertices. * * The requirment to use (a) a single vertex as the starting point; * (b) a sample of vertices; or (c) all vertices arises from the * nature of the specific IGASProgram. For both a single vertex and * a sample of vertices, the initial value must come from a policy * specified when the program is executed. For algorithms that * normally operate on all vertices, it sometimes permissible to * instead specify a sample of vertices. However, this may result in * only the connected component(s) that span the sample being * utilized by the computation. * * The requirement to run once or multiple times arises from the * desired to characterize the variance in the performance of the * IGASProgram either as a function of the sampled vertex (or * vertices) in the initial frontier and/or as a function of the * runtime variation for a given initial frontier. * * Thus, this could really be broken into two distinct parameters: * #of trials per condition (to measure the variance in the * runtime), and #of conditions (to measure the variance as a * function of graph and the initial frontier). * * The question of how we filter the initial vertices (in terms of * ensuring that they have at least one out-edge or in-edge) is part * of the same set of concerns around how to obtain a sample of * vertices from the distribution. */ final Value[] sampled; { switch (frontierEnum) { case SingleVertex: sampled = dist.getWeightedSample(opt.nsamples, gasProgram.getSampleEdgesFilter()); break; case AllVertices: // All vertices will be used. Do not sample anything. sampled = null; break; default: throw new AssertionError(); } } final IGASStats total = new GASStats(); /* * The #of vertices that were not connected for that analytic across * all trials. */ long nunconnected = 0; for (int run = 0; run < opt.nruns; run++) { if (frontierEnum == FrontierEnum.AllVertices) { // Run analytic. final IGASStats stats = (IGASStats) gasContext.call(); total.add(stats); if (log.isInfoEnabled()) { log.info("Run complete: stats(sample)=" + stats); } } else { /* * The initial frontier is a single vertex. Choose it * from the sampled vertices. */ for (int i = 0; i < sampled.length; i++) { final Value startingVertex = sampled[i]; gasState.setFrontier(gasContext, startingVertex); // Run analytic. final IGASStats stats = (IGASStats) gasContext.call(); if (stats.getFrontierSize() == 1) { /* * The starting vertex was not actually connected to any * other vertices by the traversal performed by the GAS * program. */ if (log.isInfoEnabled()) log.info("Ignoring unconnected startingVertex: " + startingVertex + ", stats=" + stats); nunconnected++; continue; } total.add(stats); if (log.isInfoEnabled()) { log.info("Run complete: vertex[" + i + "] of " + sampled.length + " : startingVertex=" + startingVertex + ", stats(sample)=" + stats); } } // next starting vertex in sample. } // end single starting vertex run. } // next run. // Total over all sampled vertices. final StringBuilder sb = new StringBuilder(); sb.append("TOTAL"); sb.append(": analytic=" + gasProgram.getClass().getSimpleName()); sb.append(", nseed=" + opt.seed); sb.append(", nruns=" + opt.nruns); // #runs (per sample if sampling) sb.append(", nsamples=" + opt.nsamples); // #desired samples sb.append(", nsampled=" + (sampled == null ? "N/A" : sampled.length));// #actually sampled sb.append(", distSize=" + (dist==null?"N/A":dist.size()));// #available for sampling. sb.append(", nunconnected=" + nunconnected);// #unconnected vertices. sb.append(", nthreads=" + opt.nthreads); sb.append(", scheduler=" + ((GASState)gasState).getScheduler().getClass().getSimpleName()); sb.append(", gasEngine=" + gasEngine.getClass().getSimpleName()); opt.report(sb); // extension hook. // performance results. sb.append(", stats(total)=" + total); System.out.println(sb); return total; } finally { gasEngine.shutdownNow(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy