com.lucidworks.spark.example.streaming.TwitterToSolrStreamProcessor Maven / Gradle / Ivy

Go to download
package com.lucidworks.spark.example.streaming;

import com.lucidworks.spark.SparkApp;
import com.lucidworks.spark.util.SolrSupport;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.log4j.Logger;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.twitter.TwitterUtils;
import twitter4j.Status;

/**
 * Simple example of indexing tweets into Solr using Spark streaming; be sure to update the
 * twitter4j.properties file on the classpath with your Twitter API credentials.
 */
public class TwitterToSolrStreamProcessor extends SparkApp.StreamProcessor {

    public static Logger log = Logger.getLogger(TwitterToSolrStreamProcessor.class);

    public String getName() {
        return "twitter-to-solr";
    }

    /**
     * Sends a stream of tweets to Solr.
     */
    @Override
    public void setup(JavaStreamingContext jssc, CommandLine cli) throws Exception {
        String filtersArg = cli.getOptionValue("tweetFilters");
        String[] filters = (filtersArg != null) ? filtersArg.split(",") : new String[0];

        // start receiving a stream of tweets ...
        JavaReceiverInputDStream tweets =
                TwitterUtils.createStream(jssc, null, filters);

        String fusionUrl = cli.getOptionValue("fusion");
        if (fusionUrl != null) {
            // just send JSON directly to Fusion
            SolrSupport.sendDStreamOfDocsToFusion(fusionUrl, cli.getOptionValue("fusionCredentials"), tweets.dstream(), batchSize);
        } else {
            // map incoming tweets into PipelineDocument objects for indexing in Solr
            JavaDStream docs = tweets.map(
                    new Function() {

                        /**
                         * Convert a twitter4j Status object into a SolrJ SolrInputDocument
                         */
                        public SolrInputDocument call(Status status) {

                            if (log.isDebugEnabled()) {
                                log.debug("Received tweet: " + status.getId() + ": " + status.getText().replaceAll("\\s+", " "));
                            }

                            // simple mapping from primitives to dynamic Solr fields using reflection
                            SolrInputDocument doc =
                                    SolrSupport.autoMapToSolrInputDoc("tweet-" + status.getId(), status, null);
                            doc.setField("provider_s", "twitter");
                            doc.setField("author_s", status.getUser().getScreenName());
                            doc.setField("type_s", status.isRetweet() ? "echo" : "post");

                            if (log.isDebugEnabled())
                                log.debug("Transformed document: " + doc.toString());
                            return doc;
                        }
                    }
            );

            // when ready, send the docs into a SolrCloud cluster
            SolrSupport.indexDStreamOfDocs(zkHost, collection, batchSize, batchSizeType, docs.dstream());
        }
    }

    public Option[] getOptions() {
        return new Option[]{
                Option.builder("tweetFilters")
                        .argName("LIST")
                        .hasArg()
                        .required(false)
                        .desc("List of Twitter keywords to filter on, separated by commas")
                        .build(),
                Option.builder("fusion")
                        .argName("URL(s)")
                        .hasArg()
                        .required(false)
                        .desc("Fusion endpoint")
                        .build(),
                Option.builder("fusionCredentials")
                        .argName("user:password:realm")
                        .hasArg()
                        .required(false)
                        .desc("Fusion credentials user:password:realm")
                        .build()
        };
    }
}