package com.lucidworks.spark;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.Serializable;
import java.net.URL;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import com.lucidworks.spark.example.events.EventsimIndexer;
import com.lucidworks.spark.example.hadoop.HdfsToSolrRDDProcessor;
import com.lucidworks.spark.example.hadoop.Logs2SolrRDDProcessor;
import com.lucidworks.spark.example.query.*;
import com.lucidworks.spark.example.streaming.DocumentFilteringStreamProcessor;
import com.lucidworks.spark.example.streaming.TwitterToSolrStreamProcessor;
import org.apache.commons.cli.*;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.client.solrj.impl.Krb5HttpClientBuilder.LOGIN_CONFIG_PROP;
/**
* Command-line utility for running Spark applications; reduces the
* boilerplate needed to implement multiple Spark applications in one project.
*/
public class SparkApp implements Serializable {
private static final String sparkExecutorExtraJavaOptionsParam = "spark.executor.extraJavaOptions";
/**
* Defines the interface to a Spark RDD processing implementation that can be run from this command-line app.
*/
public interface RDDProcessor extends Serializable {
String getName();
Option[] getOptions();
int run(SparkConf conf, CommandLine cli) throws Exception;
}
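// A minimal sketch of an RDDProcessor implementation (the class name, option
// handling, and log message below are illustrative, not part of this library):
//
//   public static class NoOpProcessor implements RDDProcessor {
//     public String getName() { return "noop"; }
//     public Option[] getOptions() { return new Option[0]; }
//     public int run(SparkConf conf, CommandLine cli) {
//       log.info("noop processor ran as app " + conf.get("spark.app.name", "unknown"));
//       return 0;
//     }
//   }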
/**
* Defines the interface to a stream processing implementation that can be run from this command-line app.
*/
public static abstract class StreamProcessor implements RDDProcessor {
protected String zkHost;
protected String collection;
protected int batchSize;
protected BatchSizeType batchSizeType;
public int run(SparkConf conf, CommandLine cli) throws Exception {
this.zkHost = cli.getOptionValue("zkHost", "localhost:9983");
this.collection = cli.getOptionValue("collection", "collection1");
this.batchSize = Integer.parseInt(cli.getOptionValue("batchSize", "10"));
this.batchSizeType = BatchSizeType.valueOf(cli.getOptionValue("batchSizeType", "NUM_DOCS"));
// Create a StreamingContext using the configured batch interval (in seconds)
int batchIntervalSecs = Integer.parseInt(cli.getOptionValue("batchInterval", "1"));
JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(batchIntervalSecs * 1000L));
// distribute the pipeline definition file if provided
if (cli.hasOption("pipeline")) {
File pipelineFile = new File(cli.getOptionValue("pipeline"));
if (!pipelineFile.isFile())
throw new FileNotFoundException(pipelineFile.getAbsolutePath()+" not found!");
jssc.sparkContext().addFile(cli.getOptionValue("pipeline"));
}
setup(jssc, cli);
jssc.start(); // Start the computation
jssc.awaitTermination(); // Wait for the computation to terminate
return 0;
}
public String getCollection() {
return collection;
}
public String getZkHost() {
return zkHost;
}
public int getBatchSize() {
return batchSize;
}
/**
* Setup for stream processing; the actual processing is started and managed by the base class.
*/
public abstract void setup(JavaStreamingContext jssc, CommandLine cli) throws Exception;
}
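// An illustrative StreamProcessor subclass sketch (names are hypothetical).
// Only setup(...) must be provided; run(...) above creates the
// JavaStreamingContext and handles start/awaitTermination:
//
//   public static class LinesToLogStreamProcessor extends StreamProcessor {
//     public String getName() { return "lines-to-log"; }
//     public Option[] getOptions() { return new Option[0]; }
//     public void setup(JavaStreamingContext jssc, CommandLine cli) {
//       jssc.socketTextStream("localhost", 9999)
//           .foreachRDD(rdd -> log.info("received " + rdd.count() + " lines for " + collection));
//     }
//   }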
public static Logger log = LoggerFactory.getLogger(SparkApp.class);
/**
* Runs a stream processor implementation.
*/
public static void main(String[] args) throws Exception {
if (args == null || args.length == 0 || args[0] == null || args[0].trim().length() == 0) {
System.err.println("Invalid command-line args! Must pass the name of a processor to run.\n"
+ "Supported processors:\n");
displayProcessorOptions(System.err);
System.exit(1);
}
// Determine the processor to run
RDDProcessor procImpl;
ClassLoader myCL = SparkApp.class.getClassLoader();
try {
Class<? extends RDDProcessor> clazz = (Class<? extends RDDProcessor>) myCL.loadClass(args[0]);
procImpl = clazz.newInstance();
} catch (ClassNotFoundException cnfe) {
procImpl = newProcessor(args[0].trim().toLowerCase(Locale.ROOT));
}
// ensure the processor is serializable
assertSerializable(procImpl);
String[] procImplArgs = new String[args.length - 1];
System.arraycopy(args, 1, procImplArgs, 0, procImplArgs.length);
// process command-line args to configure this application
CommandLine cli =
processCommandLineArgs(
joinCommonAndProcessorOptions(procImpl.getOptions()), procImplArgs);
SparkConf sparkConf = new SparkConf().setAppName(procImpl.getName());
//sparkConf.set("spark.serializer", KryoSerializer.class.getName());
//sparkConf.set("spark.kryo.registrator", LWKryoRegistrator.class.getName());
sparkConf.set("spark.task.maxFailures", "10");
setupSolrAuthenticationProps(cli, sparkConf);
String masterUrl = cli.getOptionValue("master");
if (masterUrl != null)
sparkConf.setMaster(masterUrl);
log.info("Running processor "+procImpl.getName());
int exitCode = procImpl.run(sparkConf, cli);
System.exit(exitCode);
}
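// Example invocation via spark-submit (jar name, master URL, and option
// values are illustrative):
//
//   spark-submit --master local[2] --class com.lucidworks.spark.SparkApp \
//     spark-solr-with-dependencies.jar twitter-to-solr \
//     -zkHost localhost:9983 -collection socialdata -batchSize 100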
protected static void setupSolrAuthenticationProps(CommandLine cli, SparkConf sparkConf) {
String solrJaasAuthConfig = cli.getOptionValue("solrJaasAuthConfig");
if (solrJaasAuthConfig == null || solrJaasAuthConfig.isEmpty())
return; // no jaas auth config provided
String solrJaasAppName = cli.getOptionValue("solrJaasAppName", "Client");
String solrJaasOpts = String.format(Locale.ROOT, "-D%s=%s -Dsolr.kerberos.jaas.appname=%s",
LOGIN_CONFIG_PROP, solrJaasAuthConfig, solrJaasAppName);
String sparkExecutorExtraJavaOptions =
sparkConf.contains(sparkExecutorExtraJavaOptionsParam) ? sparkConf.get(sparkExecutorExtraJavaOptionsParam) : null;
if (sparkExecutorExtraJavaOptions == null) {
sparkExecutorExtraJavaOptions = solrJaasOpts;
} else {
if (!sparkExecutorExtraJavaOptions.contains(LOGIN_CONFIG_PROP)) {
sparkExecutorExtraJavaOptions += " " + solrJaasOpts;
}
}
sparkConf.set(sparkExecutorExtraJavaOptionsParam, sparkExecutorExtraJavaOptions);
System.setProperty(LOGIN_CONFIG_PROP, solrJaasAuthConfig);
System.setProperty("solr.kerberos.jaas.appname", solrJaasAppName);
log.info("Added {} to {} for authenticating to Solr", solrJaasOpts, sparkExecutorExtraJavaOptionsParam);
}
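// A typical JAAS config file passed via -solrJaasAuthConfig might look like
// the following (keytab path, realm, and principal are placeholders); the
// "Client" section name matches the default solrJaasAppName:
//
//   Client {
//     com.sun.security.auth.module.Krb5LoginModule required
//     useKeyTab=true
//     keyTab="/path/to/solr.keytab"
//     storeKey=true
//     useTicketCache=false
//     principal="solr-client@EXAMPLE.COM";
//   };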
/**
* Support options common to all tools.
*/
public static Option[] getCommonOptions() {
return new Option[] {
Option.builder()
.hasArg()
.required(false)
.desc("Batch interval (seconds) for streaming applications; default is 1 second")
.longOpt("batchInterval")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("The master URL to connect to, such as \"local\" to run locally with one thread, \"local[4]\" to run locally with 4 cores, or \"spark://master:7077\" to run on a Spark standalone cluster.")
.longOpt("master")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("Address of the Zookeeper ensemble; defaults to: localhost:9983")
.longOpt("zkHost")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("Name of collection; no default")
.longOpt("collection")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("Number of docs to queue up on the client before sending to Solr; default is 10")
.longOpt("batchSize")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("For authenticating to Solr using JAAS, sets the '" + LOGIN_CONFIG_PROP + "' system property.")
.longOpt("solrJaasAuthConfig")
.build(),
Option.builder()
.hasArg()
.required(false)
.desc("For authenticating to Solr using JAAS, sets the 'solr.kerberos.jaas.appname' system property; default is Client")
.longOpt("solrJaasAppName")
.build()
};
}
// Creates an instance of the requested tool, using classpath scanning if necessary
private static RDDProcessor newProcessor(String streamProcType) throws Exception {
streamProcType = streamProcType.trim();
if ("twitter-to-solr".equals(streamProcType))
return new TwitterToSolrStreamProcessor();
else if ("word-count".equals(streamProcType))
return new WordCount();
else if ("term-vectors".equals(streamProcType))
return new ReadTermVectors();
else if ("docfilter".equals(streamProcType))
return new DocumentFilteringStreamProcessor();
else if ("hdfs-to-solr".equals(streamProcType))
return new HdfsToSolrRDDProcessor();
else if ("logs2solr".equals(streamProcType))
return new Logs2SolrRDDProcessor();
else if ("query-solr-benchmark".equals(streamProcType))
return new QueryBenchmark();
else if ("kmeans-anomaly".equals(streamProcType))
return new KMeansAnomaly();
else if ("eventsim".equals(streamProcType))
return new EventsimIndexer();
// If you add a built-in RDDProcessor to this class, add it here to avoid
// classpath scanning
for (Class<RDDProcessor> next : findProcessorClassesInPackage("com.lucidworks.spark")) {
RDDProcessor streamProc = next.newInstance();
if (streamProcType.equals(streamProc.getName()))
return streamProc;
}
System.err.println("\n\n "+streamProcType+
" not supported! Please check your command-line arguments and re-try. \n\n");
System.exit(1);
return null; // won't get here
}
private static void displayProcessorOptions(PrintStream out) throws Exception {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("twitter-to-solr", getProcessorOptions(new TwitterToSolrStreamProcessor()));
formatter.printHelp("word-count", getProcessorOptions(new WordCount()));
formatter.printHelp("term-vectors", getProcessorOptions(new ReadTermVectors()));
formatter.printHelp("docfilter", getProcessorOptions(new DocumentFilteringStreamProcessor()));
formatter.printHelp("hdfs-to-solr", getProcessorOptions(new HdfsToSolrRDDProcessor()));
formatter.printHelp("logs2solr", getProcessorOptions(new Logs2SolrRDDProcessor()));
formatter.printHelp("query-solr-benchmark", getProcessorOptions(new QueryBenchmark()));
formatter.printHelp("kmeans-anomaly", getProcessorOptions(new KMeansAnomaly()));
formatter.printHelp("eventsim", getProcessorOptions(new EventsimIndexer()));
List<Class<RDDProcessor>> toolClasses = findProcessorClassesInPackage("com.lucidworks.spark");
for (Class<RDDProcessor> next : toolClasses) {
RDDProcessor tool = next.newInstance();
formatter.printHelp(tool.getName(), getProcessorOptions(tool));
}
}
private static Options getProcessorOptions(RDDProcessor tool) {
Options options = new Options();
options.addOption("h", "help", false, "Print this message");
options.addOption("v", "verbose", false, "Generate verbose log messages");
Option[] toolOpts = joinCommonAndProcessorOptions(tool.getOptions());
for (int i = 0; i < toolOpts.length; i++)
options.addOption(toolOpts[i]);
return options;
}
public static Option[] joinCommonAndProcessorOptions(Option[] toolOpts) {
return joinOptions(getCommonOptions(), toolOpts);
}
public static Option[] joinOptions(Option[] lhs, Option[] rhs) {
List<Option> options = new ArrayList<>();
if (lhs != null) {
  for (Option opt : lhs)
    options.add(opt);
}
if (rhs != null) {
  for (Option opt : rhs)
    options.add(opt);
}
return options.toArray(new Option[0]);
}
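// The remaining helpers referenced above were truncated from this listing.
// The sketches below are best-effort reconstructions based on the imports
// and call sites in this file, not the verbatim originals.

// Round-trips the processor through Java serialization so problems surface
// before Spark ships it to the executors.
private static void assertSerializable(RDDProcessor proc) throws Exception {
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  ObjectOutputStream oos = new ObjectOutputStream(baos);
  oos.writeObject(proc);
  oos.flush();
  ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray()));
  ois.readObject(); // throws if the processor (or any of its fields) is not serializable
}

// Parses args with commons-cli; prints usage and exits on -help or bad input.
public static CommandLine processCommandLineArgs(Option[] customOptions, String[] args) {
  Options options = new Options();
  options.addOption("help", false, "Print this message");
  options.addOption("verbose", false, "Generate verbose log messages");
  for (Option opt : customOptions)
    options.addOption(opt);
  CommandLine cli = null;
  try {
    cli = new DefaultParser().parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Failed to parse command-line arguments: " + exp.getMessage());
    new HelpFormatter().printHelp(SparkApp.class.getName(), options);
    System.exit(1);
  }
  if (cli.hasOption("help")) {
    new HelpFormatter().printHelp(SparkApp.class.getName(), options);
    System.exit(0);
  }
  return cli;
}

// Scans classpath jars for concrete RDDProcessor implementations in the
// given package; only jar: URLs are handled in this sketch.
@SuppressWarnings("unchecked")
private static List<Class<RDDProcessor>> findProcessorClassesInPackage(String packageName) throws Exception {
  List<Class<RDDProcessor>> classes = new ArrayList<>();
  String path = packageName.replace('.', '/');
  ClassLoader cl = SparkApp.class.getClassLoader();
  Enumeration<URL> resources = cl.getResources(path);
  while (resources.hasMoreElements()) {
    URL resource = resources.nextElement();
    if (!"jar".equals(resource.getProtocol()))
      continue;
    // jar URL paths look like: file:/path/to/app.jar!/com/lucidworks/spark
    String jarPath = resource.getPath().substring("file:".length(), resource.getPath().indexOf('!'));
    try (ZipInputStream zip = new ZipInputStream(new java.io.FileInputStream(jarPath))) {
      ZipEntry entry;
      while ((entry = zip.getNextEntry()) != null) {
        String name = entry.getName();
        if (!name.startsWith(path) || !name.endsWith(".class") || name.contains("$"))
          continue;
        Class<?> clazz = cl.loadClass(name.substring(0, name.length() - ".class".length()).replace('/', '.'));
        if (RDDProcessor.class.isAssignableFrom(clazz)
            && !clazz.isInterface()
            && !java.lang.reflect.Modifier.isAbstract(clazz.getModifiers())) {
          classes.add((Class<RDDProcessor>) clazz);
        }
      }
    }
  }
  return classes;
}
}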