package com.lucidworks.spark;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.Serializable;
import java.net.URL;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import com.lucidworks.spark.example.events.EventsimIndexer;
import com.lucidworks.spark.example.hadoop.HdfsToSolrRDDProcessor;
import com.lucidworks.spark.example.hadoop.Logs2SolrRDDProcessor;
import com.lucidworks.spark.example.query.*;
import com.lucidworks.spark.example.streaming.DocumentFilteringStreamProcessor;
import com.lucidworks.spark.example.streaming.TwitterToSolrStreamProcessor;
import org.apache.commons.cli.*;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.client.solrj.impl.Krb5HttpClientBuilder.LOGIN_CONFIG_PROP;
/**
* Command-line utility for running Spark applications; reduces the
* boilerplate needed to implement multiple Spark applications in one project.
*/
public class SparkApp implements Serializable {
private static final String sparkExecutorExtraJavaOptionsParam = "spark.executor.extraJavaOptions";
/**
* Defines the interface to a Spark RDD processing implementation that can be run from this command-line app.
*/
public interface RDDProcessor extends Serializable {
String getName();
Option[] getOptions();
int run(SparkConf conf, CommandLine cli) throws Exception;
}
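// A minimal sketch of an RDDProcessor implementation (the class name, option
// handling, and log message below are illustrative, not part of this library):
//
//   public static class NoOpProcessor implements RDDProcessor {
//     public String getName() { return "noop"; }
//     public Option[] getOptions() { return new Option[0]; }
//     public int run(SparkConf conf, CommandLine cli) {
//       log.info("noop processor ran as app " + conf.get("spark.app.name", "unknown"));
//       return 0;
//     }
//   }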
/**
* Defines the interface to a stream processing implementation that can be run from this command-line app.
*/
public static abstract class StreamProcessor implements RDDProcessor {
protected String zkHost;
protected String collection;
protected int batchSize;
protected BatchSizeType batchSizeType;
public int run(SparkConf conf, CommandLine cli) throws Exception {
this.zkHost = cli.getOptionValue("zkHost", "localhost:9983");
this.collection = cli.getOptionValue("collection", "collection1");
this.batchSize = Integer.parseInt(cli.getOptionValue("batchSize", "10"));
this.batchSizeType = BatchSizeType.valueOf(cli.getOptionValue("batchSizeType", "NUM_DOCS"));
// Create a StreamingContext using the configured batch interval (in seconds)
int batchIntervalSecs = Integer.parseInt(cli.getOptionValue("batchInterval", "1"));
JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(batchIntervalSecs * 1000L));
// distribute the pipeline definition file if provided
if (cli.hasOption("pipeline")) {
File pipelineFile = new File(cli.getOptionValue("pipeline"));
if (!pipelineFile.isFile())
throw new FileNotFoundException(pipelineFile.getAbsolutePath()+" not found!");
jssc.sparkContext().addFile(cli.getOptionValue("pipeline"));
}
setup(jssc, cli);
jssc.start(); // Start the computation
jssc.awaitTermination(); // Wait for the computation to terminate
return 0;
}
public String getCollection() {
return collection;
}
public String getZkHost() {
return zkHost;
}
public int getBatchSize() {
return batchSize;
}
/**
* Setup for stream processing; the actual processing is started and managed by the base class.
*/
public abstract void setup(JavaStreamingContext jssc, CommandLine cli) throws Exception;
}
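// An illustrative StreamProcessor subclass sketch (names are hypothetical).
// Only setup(...) must be provided; run(...) above creates the
// JavaStreamingContext and handles start/awaitTermination:
//
//   public static class LinesToLogStreamProcessor extends StreamProcessor {
//     public String getName() { return "lines-to-log"; }
//     public Option[] getOptions() { return new Option[0]; }
//     public void setup(JavaStreamingContext jssc, CommandLine cli) {
//       jssc.socketTextStream("localhost", 9999)
//           .foreachRDD(rdd -> log.info("received " + rdd.count() + " lines for " + collection));
//     }
//   }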
public static Logger log = LoggerFactory.getLogger(SparkApp.class);
/**
* Runs a stream processor implementation.
*/
public static void main(String[] args) throws Exception {
if (args == null || args.length == 0 || args[0] == null || args[0].trim().length() == 0) {
System.err.println("Invalid command-line args! Must pass the name of a processor to run.\n"
+ "Supported processors:\n");
displayProcessorOptions(System.err);
System.exit(1);
}
// Determine the processor to run
RDDProcessor procImpl;
ClassLoader myCL = SparkApp.class.getClassLoader();
try {
Class<? extends RDDProcessor> clazz = (Class<? extends RDDProcessor>) myCL.loadClass(args[0]);
procImpl = clazz.newInstance();
} catch (ClassNotFoundException cnfe) {
procImpl = newProcessor(args[0].trim().toLowerCase(Locale.ROOT));
}
// ensure the processor is serializable
assertSerializable(procImpl);
String[] procImplArgs = new String[args.length - 1];
System.arraycopy(args, 1, procImplArgs, 0, procImplArgs.length);
// process command-line args to configure this application
CommandLine cli =
processCommandLineArgs(
joinCommonAndProcessorOptions(procImpl.getOptions()), procImplArgs);
SparkConf sparkConf = new SparkConf().setAppName(procImpl.getName());
//sparkConf.set("spark.serializer", KryoSerializer.class.getName());
//sparkConf.set("spark.kryo.registrator", LWKryoRegistrator.class.getName());
sparkConf.set("spark.task.maxFailures", "10");
setupSolrAuthenticationProps(cli, sparkConf);
String masterUrl = cli.getOptionValue("master");
if (masterUrl != null)
sparkConf.setMaster(masterUrl);
log.info("Running processor "+procImpl.getName());
int exitCode = procImpl.run(sparkConf, cli);
System.exit(exitCode);
}
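// Example invocation via spark-submit (jar name, master URL, and option
// values are illustrative):
//
//   spark-submit --master local[2] --class com.lucidworks.spark.SparkApp \
//     spark-solr-with-dependencies.jar twitter-to-solr \
//     -zkHost localhost:9983 -collection socialdata -batchSize 100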
protected static void setupSolrAuthenticationProps(CommandLine cli, SparkConf sparkConf) {
String solrJaasAuthConfig = cli.getOptionValue("solrJaasAuthConfig");
if (solrJaasAuthConfig == null || solrJaasAuthConfig.isEmpty())
return; // no jaas auth config provided
String solrJaasAppName = cli.getOptionValue("solrJaasAppName", "Client");
String solrJaasOpts = String.format(Locale.ROOT, "-D%s=%s -Dsolr.kerberos.jaas.appname=%s",
LOGIN_CONFIG_PROP, solrJaasAuthConfig, solrJaasAppName);
String sparkExecutorExtraJavaOptions =
sparkConf.contains(sparkExecutorExtraJavaOptionsParam) ? sparkConf.get(sparkExecutorExtraJavaOptionsParam) : null;
if (sparkExecutorExtraJavaOptions == null) {
sparkExecutorExtraJavaOptions = solrJaasOpts;
} else {
if (!sparkExecutorExtraJavaOptions.contains(LOGIN_CONFIG_PROP)) {
sparkExecutorExtraJavaOptions += " " + solrJaasOpts;
}
}
sparkConf.set(sparkExecutorExtraJavaOptionsParam, sparkExecutorExtraJavaOptions);
System.setProperty(LOGIN_CONFIG_PROP, solrJaasAuthConfig);
System.setProperty("solr.kerberos.jaas.appname", solrJaasAppName);
log.info("Added {} to {} for authenticating to Solr", solrJaasOpts, sparkExecutorExtraJavaOptionsParam);
}
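// A typical JAAS config file passed via -solrJaasAuthConfig might look like
// the following (keytab path, realm, and principal are placeholders); the
// "Client" section name matches the default solrJaasAppName:
//
//   Client {
//     com.sun.security.auth.module.Krb5LoginModule required
//     useKeyTab=true
//     keyTab="/path/to/solr.keytab"
//     storeKey=true
//     useTicketCache=false
//     principal="solr-client@EXAMPLE.COM";
//   };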
/**
* Support options common to all tools.
*/
public static Option[] getCommonOptions() {
return new Option[] {
Option.builder()
.hasArg()
.required(false)
.desc("Batch interval (seconds) for streaming applications; default is 1 second")
.longOpt("batchInterval")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("The master URL to connect to, such as \"local\" to run locally with one thread, \"local[4]\" to run locally with 4 cores, or \"spark://master:7077\" to run on a Spark standalone cluster.")
.longOpt("master")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("Address of the Zookeeper ensemble; defaults to: localhost:9983")
.longOpt("zkHost")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("Name of collection; no default")
.longOpt("collection")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("Number of docs to queue up on the client before sending to Solr; default is 10")
.longOpt("batchSize")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("For authenticating to Solr using JAAS, sets the '" + LOGIN_CONFIG_PROP + "' system property.")
.longOpt("solrJaasAuthConfig")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("For authenticating to Solr using JAAS, sets the 'solr.kerberos.jaas.appname' system property; default is Client")
.longOpt("solrJaasAppName")
.build()
};
}
// Creates an instance of the requested tool, using classpath scanning if necessary
private static RDDProcessor newProcessor(String streamProcType) throws Exception {
streamProcType = streamProcType.trim();
if ("twitter-to-solr".equals(streamProcType))
return new TwitterToSolrStreamProcessor();
else if ("word-count".equals(streamProcType))
return new WordCount();
else if ("term-vectors".equals(streamProcType))
return new ReadTermVectors();
else if ("docfilter".equals(streamProcType))
return new DocumentFilteringStreamProcessor();
else if ("hdfs-to-solr".equals(streamProcType))
return new HdfsToSolrRDDProcessor();
else if ("logs2solr".equals(streamProcType))
return new Logs2SolrRDDProcessor();
else if ("query-solr-benchmark".equals(streamProcType))
return new QueryBenchmark();
else if ("kmeans-anomaly".equals(streamProcType))
return new KMeansAnomaly();
else if ("eventsim".equals(streamProcType))
return new EventsimIndexer();
// If you add a built-in RDDProcessor to this class, add it here to avoid
// classpath scanning
for (Class<RDDProcessor> next : findProcessorClassesInPackage("com.lucidworks.spark")) {
RDDProcessor streamProc = next.newInstance();
if (streamProcType.equals(streamProc.getName()))
return streamProc;
}
System.err.println("\n\n "+streamProcType+
" not supported! Please check your command-line arguments and re-try. \n\n");
System.exit(1);
return null; // won't get here
}
private static void displayProcessorOptions(PrintStream out) throws Exception {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("twitter-to-solr", getProcessorOptions(new TwitterToSolrStreamProcessor()));
formatter.printHelp("word-count", getProcessorOptions(new WordCount()));
formatter.printHelp("term-vectors", getProcessorOptions(new ReadTermVectors()));
formatter.printHelp("docfilter", getProcessorOptions(new DocumentFilteringStreamProcessor()));
formatter.printHelp("hdfs-to-solr", getProcessorOptions(new HdfsToSolrRDDProcessor()));
formatter.printHelp("logs2solr", getProcessorOptions(new Logs2SolrRDDProcessor()));
formatter.printHelp("query-solr-benchmark", getProcessorOptions(new QueryBenchmark()));
formatter.printHelp("kmeans-anomaly", getProcessorOptions(new KMeansAnomaly()));
formatter.printHelp("eventsim", getProcessorOptions(new EventsimIndexer()));
List<Class<RDDProcessor>> toolClasses = findProcessorClassesInPackage("com.lucidworks.spark");
for (Class<RDDProcessor> next : toolClasses) {
RDDProcessor tool = next.newInstance();
formatter.printHelp(tool.getName(), getProcessorOptions(tool));
}
}
private static Options getProcessorOptions(RDDProcessor tool) {
Options options = new Options();
options.addOption("h", "help", false, "Print this message");
options.addOption("v", "verbose", false, "Generate verbose log messages");
Option[] toolOpts = joinCommonAndProcessorOptions(tool.getOptions());
for (int i = 0; i < toolOpts.length; i++)
options.addOption(toolOpts[i]);
return options;
}
public static Option[] joinCommonAndProcessorOptions(Option[] toolOpts) {
return joinOptions(getCommonOptions(), toolOpts);
}
public static Option[] joinOptions(Option[] lhs, Option[] rhs) {
List<Option> options = new ArrayList<>();
if (lhs != null) {
  for (Option opt : lhs)
    options.add(opt);
}
if (rhs != null) {
  for (Option opt : rhs)
    options.add(opt);
}
return options.toArray(new Option[0]);
}
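// The remaining helpers referenced above were truncated from this listing.
// The sketches below are best-effort reconstructions based on the imports
// and call sites in this file, not the verbatim originals.

// Round-trips the processor through Java serialization so problems surface
// before Spark ships it to the executors.
private static void assertSerializable(RDDProcessor proc) throws Exception {
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  ObjectOutputStream oos = new ObjectOutputStream(baos);
  oos.writeObject(proc);
  oos.flush();
  ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray()));
  ois.readObject(); // throws if the processor (or any of its fields) is not serializable
}

// Parses args with commons-cli; prints usage and exits on -help or bad input.
public static CommandLine processCommandLineArgs(Option[] customOptions, String[] args) {
  Options options = new Options();
  options.addOption("help", false, "Print this message");
  options.addOption("verbose", false, "Generate verbose log messages");
  for (Option opt : customOptions)
    options.addOption(opt);
  CommandLine cli = null;
  try {
    cli = new DefaultParser().parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Failed to parse command-line arguments: " + exp.getMessage());
    new HelpFormatter().printHelp(SparkApp.class.getName(), options);
    System.exit(1);
  }
  if (cli.hasOption("help")) {
    new HelpFormatter().printHelp(SparkApp.class.getName(), options);
    System.exit(0);
  }
  return cli;
}

// Scans classpath jars for concrete RDDProcessor implementations in the
// given package; only jar: URLs are handled in this sketch.
@SuppressWarnings("unchecked")
private static List<Class<RDDProcessor>> findProcessorClassesInPackage(String packageName) throws Exception {
  List<Class<RDDProcessor>> classes = new ArrayList<>();
  String path = packageName.replace('.', '/');
  ClassLoader cl = SparkApp.class.getClassLoader();
  Enumeration<URL> resources = cl.getResources(path);
  while (resources.hasMoreElements()) {
    URL resource = resources.nextElement();
    if (!"jar".equals(resource.getProtocol()))
      continue;
    // jar URL paths look like: file:/path/to/app.jar!/com/lucidworks/spark
    String jarPath = resource.getPath().substring("file:".length(), resource.getPath().indexOf('!'));
    try (ZipInputStream zip = new ZipInputStream(new java.io.FileInputStream(jarPath))) {
      ZipEntry entry;
      while ((entry = zip.getNextEntry()) != null) {
        String name = entry.getName();
        if (!name.startsWith(path) || !name.endsWith(".class") || name.contains("$"))
          continue;
        Class<?> clazz = cl.loadClass(name.substring(0, name.length() - ".class".length()).replace('/', '.'));
        if (RDDProcessor.class.isAssignableFrom(clazz)
            && !clazz.isInterface()
            && !java.lang.reflect.Modifier.isAbstract(clazz.getModifiers())) {
          classes.add((Class<RDDProcessor>) clazz);
        }
      }
    }
  }
  return classes;
}
}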