package com.lucidworks.spark.example.hadoop;

import com.lucidworks.spark.BatchSizeType;
import com.lucidworks.spark.SparkApp;
import com.lucidworks.spark.util.SolrSupport;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
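
/**
 * Example SparkApp.RDDProcessor that indexes tab-delimited text files from HDFS
 * into a SolrCloud collection. The expected field layout is the fixed pigSchema
 * of the benchmarking dataset generated by the Solr Scale Toolkit (see below).
 */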
public class HdfsToSolrRDDProcessor implements SparkApp.RDDProcessor {

  public static Logger log = Logger.getLogger(HdfsToSolrRDDProcessor.class);

  public String getName() {
    return "hdfs-to-solr";
  }
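
  // Options specific to this processor; -zkHost and -collection are read in run()
  // with defaults, presumably arriving via the SparkApp driver's common options.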
  public Option[] getOptions() {
    return new Option[]{
        Option.builder("hdfsPath")
            .argName("PATH")
            .hasArg()
            .required(false)
            .desc("HDFS path identifying the directories / files to index")
            .build(),
        Option.builder("queueSize")
            .argName("INT")
            .hasArg()
            .required(false)
            .desc("Queue size for ConcurrentUpdateSolrClient; default is 1000")
            .build(),
        Option.builder("numRunners")
            .argName("INT")
            .hasArg()
            .required(false)
            .desc("Number of runner threads per ConcurrentUpdateSolrClient instance; default is 2")
            .build(),
        Option.builder("pollQueueTime")
            .argName("INT")
            .hasArg()
            .required(false)
            .desc("Number of millis to wait until CUSS sees a doc on the queue before it closes the current request and starts another; default is 20 ms")
            .build()
    };
  }
  // Benchmarking dataset generated by the Solr Scale Toolkit
  private static final String[] pigSchema =
      ("id,integer1_i,integer2_i,long1_l,long2_l,float1_f,float2_f,double1_d,double2_d,timestamp1_tdt," +
       "timestamp2_tdt,string1_s,string2_s,string3_s,boolean1_b,boolean2_b,text1_en,text2_en,text3_en,random_bucket").split(",");
  public int run(SparkConf conf, CommandLine cli) throws Exception {
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaRDD<String> textFiles = jsc.textFile(cli.getOptionValue("hdfsPath"));
      JavaPairRDD<String, SolrInputDocument> pairs =
          textFiles.mapToPair(new PairFunction<String, String, SolrInputDocument>() {
            public Tuple2<String, SolrInputDocument> call(String line) throws Exception {
              SolrInputDocument doc = new SolrInputDocument();
              String[] row = line.split("\t");
              if (row.length != pigSchema.length)
                return null; // malformed line; removed by the filter below
              for (int c = 0; c < row.length; c++)
                if (row[c] != null && row[c].length() > 0)
                  doc.setField(pigSchema[c], row[c]);
              return new Tuple2<>((String) doc.getFieldValue("id"), doc);
            }
          }).filter(tuple -> tuple != null); // drop nulls so indexing never sees them
      String zkHost = cli.getOptionValue("zkHost", "localhost:9983");
      String collection = cli.getOptionValue("collection", "collection1");

      // These three options only apply to the ConcurrentUpdateSolrClient streaming
      // path below, which is left commented out in favor of batch indexing.
      int queueSize = Integer.parseInt(cli.getOptionValue("queueSize", "1000"));
      int numRunners = Integer.parseInt(cli.getOptionValue("numRunners", "2"));
      int pollQueueTime = Integer.parseInt(cli.getOptionValue("pollQueueTime", "20"));
      //SolrSupport.streamDocsIntoSolr(zkHost, collection, "id", pairs, queueSize, numRunners, pollQueueTime);

      // index the documents in batches of 100 docs each
      SolrSupport.indexDocs(zkHost, collection, 100, BatchSizeType.NUM_DOCS, pairs.values().rdd());

      // send a final commit in case soft auto-commits are not enabled
      CloudSolrClient cloudSolrClient = SolrSupport.getCachedCloudClient(zkHost);
      cloudSolrClient.setDefaultCollection(collection);
      cloudSolrClient.commit(true, true);
    }
    return 0;
  }
}
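
/*
 * Usage sketch. The jar name below is a placeholder and the spark-submit
 * invocation is an assumption based on the SparkApp driver pattern, not a
 * verified command line; the processor is selected by its getName() value:
 *
 *   spark-submit --class com.lucidworks.spark.SparkApp spark-solr-shaded.jar \
 *     hdfs-to-solr -zkHost localhost:9983 -collection collection1 \
 *     -hdfsPath hdfs://namenode:8020/path/to/data
 */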