package com.lucidworks.spark.example.ml
import java.net.URI
import java.util.Locale
import com.lucidworks.spark.{LazyLogging, SparkApp}
import com.lucidworks.spark.util.SolrSupport
import org.apache.commons.cli.CommandLine
import org.apache.commons.cli.Option.{builder => OptionBuilder}
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.input.PortableDataStream
import org.joda.time.DateTimeZone
import org.joda.time.format.{DateTimeFormat, DateTimeFormatterBuilder, ISODateTimeFormat}
import scala.collection.mutable.ListBuffer
import scala.collection.convert.ImplicitConversions._
import scala.io.Source
import scala.util.control.NonFatal
/** Example application to index each article in the 20 newsgroups data as a Solr document.
* The 20 newsgroups data is downloadable from [[http://qwone.com/~jason/20Newsgroups/]].
*
* Articles in any of the three archives available there can be indexed,
* after first downloading it from the above page and unpacking it.
*
* The path you supply as an argument to the `--path` cmdline option (see the
* "Example invocation" section below) should be the directory containing the newsgroup
* folders. All files found recursively under this path will be indexed to Solr.
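*
* For example (the newsgroup folder and article file names here are illustrative), a directory
* layout like the following would work:
*
* {{{
* 20news-18828/
*   comp.graphics/
*     38659
*     38660
*   sci.space/
*     60805
* }}}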
*
* == Prerequisites ==
*
* Start Solr in cloud mode and create a target collection. After downloading the binary Solr
* distribution (see [[https://lucene.apache.org/solr/]]), unpack it, change to the unpacked
* root directory (e.g. `solr-5.4.1/`), and run:
*
* {{{
* bin/solr start -c && bin/solr create -c ml20news -shards 2
* }}}
*
* == Document fields ==
*
* Each header present in the newsgroup articles will be indexed into a Solr dynamic field
* whose name is prefixed with the header name, e.g. `Subject:` text will be indexed into
* a field named `Subject_txt_en`.
*
* Note that the set of headers in each of the three available archives is different; details
* are on the download page above.
*
* The body of each article will be indexed into the `content_txt_en` field.
*
* The `newsgroup_s` field will contain the name of the article's parent directory.
*
* The `id` field value will be in the format `newsgroup_articlenum`, e.g. "comp.graphics_38659",
* where `newsgroup` is the name of the article's parent directory, and `articlenum` is the
* article filename.
*
* The `filepath_s` field will contain the full path of the article source file.
*
* If you downloaded the `20news-19997.tar.gz` archive, the only one of the three that includes
* the `Date:` header, dates will be indexed into two fields: the `Date_s` field will contain
* the original `Date:` text, and the `Date_tdt` field will contain the date reformatted in
* ISO-8601 format.
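*
* Putting it together, an article `comp.graphics/38659` with a `Subject:` header would produce
* a document along these lines (field values shown are illustrative):
*
* {{{
* id: comp.graphics_38659
* newsgroup_s: comp.graphics
* filepath_s: /path/to/20news-18828/comp.graphics/38659
* Subject_txt_en: <Subject: header text>
* content_txt_en: <article body>
* }}}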
*
* == Example invocation ==
*
* You must first run `mvn -DskipTests package` in the spark-solr project, and you must download
* a Spark 1.6.1 binary distribution and point the environment variable `$SPARK_HOME`
* to the unpacked distribution directory.
*
* {{{
* $SPARK_HOME/bin/spark-submit --master 'local[2]' --class com.lucidworks.spark.SparkApp \
* target/spark-solr-2.0.0-SNAPSHOT-shaded.jar newsgroups2solr -zkHost localhost:9983 \
* -collection ml20news -path /relative/or/absolute/path/to/20news-18828
* }}}
*
* To see a description of all available options, run the following:
*
* {{{
* $SPARK_HOME/bin/spark-submit --class com.lucidworks.spark.SparkApp \
* target/spark-solr-2.0.0-SNAPSHOT-shaded.jar newsgroups2solr --help
* }}}
*/
class NewsgroupsIndexer extends SparkApp.RDDProcessor with LazyLogging {
  import NewsgroupsIndexer._

  def getName = "newsgroups2solr"

  def getOptions = Array(
    OptionBuilder().longOpt("path").hasArg.argName("PATH").required
      .desc("Path from which to recursively load newsgroup articles").build,
    OptionBuilder().longOpt("collection").hasArg.argName("NAME").required(false)
      .desc(s"Target Solr collection; default: $DefaultCollection").build)
  def run(conf: SparkConf, cli: CommandLine): Int = {
    val path = cli.getOptionValue("path")
    val collection = cli.getOptionValue("collection", DefaultCollection)
    val zkHost = cli.getOptionValue("zkHost", DefaultZkHost)
    val batchSize = cli.getOptionValue("batchSize", DefaultBatchSize).toInt
    val sc = new SparkContext(conf)
    sc.hadoopConfiguration.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true)
    // Use binaryFiles() because wholeTextFiles() assumes files are UTF-8, but article encoding is Latin-1
    sc.binaryFiles(path).foreachPartition(rows => {
      var numDocs = 0
      val solrServer = SolrSupport.getCachedCloudClient(zkHost)
      val batch = ListBuffer.empty[SolrInputDocument]
      def sendBatch(): Unit = {
        SolrSupport.sendBatchToSolr(solrServer, collection, batch.toList)
        numDocs += batch.size
        logger.info(s"Sent $numDocs docs to Solr from $path")
        batch.clear()
      }
      rows.foreach(row => { // each row is a Tuple: (path, PortableDataStream)
        val (group, articleNum) = parseFilePath(path, row._1)
        val doc = loadNewsgroupArticle(row._2)
        // Newsgroup name is the parent directory; if this file's path doesn't have a parent directory
        // after removing the source path, use the first listed newsgroup from the article content
        val newsgroup = group.getOrElse(doc.getFieldValues("Newsgroups_ss").head)
        doc.addField("id", s"${newsgroup}_$articleNum")
        doc.addField("newsgroup_s", newsgroup)
        doc.addField("filepath_s", row._1)
        batch += doc
        if (batch.size >= batchSize) sendBatch()
      })
      if (batch.nonEmpty) sendBatch()
    })
    // Explicit commit to make sure all docs are visible
    val solrServer = SolrSupport.getCachedCloudClient(zkHost)
    solrServer.commit(collection, true, true)
    sc.stop()
    0
  }
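  /** Splits an article's file path, relative to the base path, into its newsgroup (parent
   *  directory, if any) and article number (filename). For example (paths are illustrative),
   *  `parseFilePath("/data/20news-18828", "/data/20news-18828/comp.graphics/38659")`
   *  returns `(Some("comp.graphics"), "38659")`.
   */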
  def parseFilePath(basePath: String, filePath: String): (Option[String], String) = {
    val segments = new URI(basePath).relativize(new URI(filePath)).getPath.split('/').reverse
    val articleNum = segments(0) // Trailing segment is the filename, which is the article number
    // Parent segment, if there is one, is the identified newsgroup
    val newsgroup = if (segments.length > 1) Some(segments(1)) else None
    (newsgroup, articleNum)
  }
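  /** Reads a Latin-1 encoded article from the stream and builds a SolrInputDocument:
   *  header lines (e.g. "Subject: ...") are mapped to dynamic fields, and everything after
   *  the first non-header line is collected into the `content_txt_en` field.
   */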
  def loadNewsgroupArticle(stream: PortableDataStream): SolrInputDocument = {
    val doc = new SolrInputDocument
    val inputStream = stream.open
    try {
      var noMoreHeaders = false
      val content = new StringBuilder
      for (line <- Source.fromInputStream(inputStream, "ISO-8859-1").getLines) {
        val cleanedLine = NonXmlCharsRegex.replaceAllIn(line, " ") // (Nonprinting) non-XML chars -> spaces
        if (noMoreHeaders) {
          content.append(cleanedLine).append("\n")
        } else {
          NewsgroupHeaderRegex.findFirstMatchIn(cleanedLine) match {
            case None =>
              noMoreHeaders = true
              content.append(cleanedLine).append("\n")
            case Some(fieldValue) => {
              // Non-alphanumeric chars in header names become underscores, e.g. "Message-ID" -> "Message_ID"
              val field = NonAlphaNumCharsRegex.replaceAllIn(fieldValue.group(1), "_")
              val value = fieldValue.group(2)
              field match {
                case "Message_ID" => doc.addField(s"${field}_s", value.trim)
                case "From" | "Subject" | "Sender" => doc.addField(s"${field}_txt_en", value.trim)
                case "Newsgroups" => value.split(",").map(_.trim).filter(_.length > 0)
                  .foreach(newsgroup => doc.addField(s"${field}_ss", newsgroup))
                case "Date" => // 2 fields: original date text, and reformatted as ISO-8601
                  val trimmedValue = value.trim
                  doc.addField(s"${field}_s", trimmedValue)
                  DateConverter.toISO8601(trimmedValue).foreach(doc.addField(s"${field}_tdt", _))
                case _ => doc.addField(s"${field}_txt", value)
              }
            }
          }
        }
      }
      doc.addField("content_txt_en", content.toString())
    } finally {
      inputStream.close()
    }
    doc
  }
}
object NewsgroupsIndexer {
  val DefaultZkHost = "localhost:9983"
  val DefaultBatchSize = "100"
  val DefaultCollection = "ml20news"
  // Control characters that are not legal in XML documents
  val NonXmlCharsRegex = "[\u0000-\u0008\u000B\u000C\u000E-\u001F]".r
  // Captures a header name (before the colon) and its value
  val NewsgroupHeaderRegex = "^([^: \t]+):[ \t]*(.*)".r
  val NonAlphaNumCharsRegex = "[^_A-Za-z0-9]".r
}
/** Converts time zone abbreviations to IDs that Joda-Time understands, parses dates using
* a set of date formats known to be present in the 20 newsgroups data, then converts them
* to ISO-8601 format.
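*
* A minimal usage sketch (the output shown is illustrative):
*
* {{{
* DateConverter.toISO8601("Sun, 18 Apr 93 13:35:23 EDT")
* // => Some("1993-04-18T13:35:23-04:00")
* }}}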
*/
object DateConverter extends Serializable with LazyLogging {
  // Map of time zone abbreviations to time zone IDs, compiled from common time zone abbreviation
  // references and from Joda-Time v2.2 DateTimeZone.getAvailableIds()
  val ZoneMap = Map("+3000" -> "Europe/Moscow", // 19 Apr 93 16:15:19 +3000 <- invalid offset, should be +0300
    "ACST" -> "Australia/Adelaide", "BST" -> "Europe/London", "CDT" -> "America/Chicago",
    "CET" -> "Europe/Brussels", "CST" -> "America/Chicago", "ECT" -> "America/Guayaquil",
    "EDT" -> "America/New_York", "EST" -> "America/New_York", "GMT" -> "Etc/GMT",
    "GMT+12" -> "Etc/GMT+12", "IDT" -> "Asia/Jerusalem", "KST" -> "Asia/Seoul",
    "MDT" -> "America/Denver", "MET" -> "Europe/Berlin", "MEZ" -> "Europe/Berlin",
    "MST" -> "America/Denver", "NZDT" -> "Pacific/Auckland", "NZST" -> "Pacific/Auckland",
    "PDT" -> "America/Los_Angeles", "PST" -> "America/Los_Angeles", "TUR" -> "Asia/Istanbul",
    "UT" -> "Etc/UTC", "UTC" -> "Etc/UTC")
    .map(e => e._1 -> DateTimeZone.forID(e._2))
  // Below can't be triple quoted; interpolated raw strings and escapes don't mix: https://issues.scala-lang.org/browse/SI-6476
  val ZonesRegex = s"\\s+\\(?((?i)${ZoneMap.keys.map(z => s"\\Q$z\\E").mkString("|")})\\)?$$".r
  val DateParsers = Array("dd MMM yy",
    "dd MMM yy HH:mm",
    "dd MMM yy HH:mm Z", // Z is for offsets, e.g. +0200, -0400
    "dd MMM yy HH:mm:ss",
    "dd MMM yy HH:mm:ss Z",
    "MM/dd/yy",
    "MMM dd, yy",
    "yyyy-MM-dd HH:mm:ss",
    "MMM dd HH:mm:ss yy").map(DateTimeFormat.forPattern(_).getParser)
  val Formatter = new DateTimeFormatterBuilder().append(null, DateParsers)
    .toFormatter.withPivotYear(1970).withLocale(Locale.ENGLISH)
  val MultiSpaceRegex = """\s{2,}""".r
  val TrailingOffsetRegex = """\s*\([-+]\d{4}\)$""".r // Sun, 18 Apr 93 13:35:23 EDT(-0400)
  val DayOfWeekPattern = "(?i:Sun|Mon|Tue(?:s)?|Wed(?:nes)?|Thu(?:rs)?|Fri|Sat(?:ur)?)(?i:day)?"
  val DowRegex = s"$DayOfWeekPattern,?\\s*|\\s*\\($DayOfWeekPattern\\)$$".r
  def toISO8601(date: String): Option[String] = {
    try {
      var zone = DateTimeZone.UTC
      val dateSingleSpaced = MultiSpaceRegex.replaceAllIn(date, " ")
      val dateNoExtraTrailingOffset = TrailingOffsetRegex.replaceFirstIn(dateSingleSpaced, "")
      val dateNoDow = DowRegex.replaceFirstIn(dateNoExtraTrailingOffset, "")
      val dateNoZone = ZonesRegex.replaceAllIn(dateNoDow, m => {
        zone = ZoneMap(m.group(1).toUpperCase(Locale.ROOT))
        "" // remove time zone abbreviations
      })
      Some(Formatter.withZone(zone).parseDateTime(dateNoZone).toString(ISODateTimeFormat.dateTimeNoMillis()))
    } catch {
      case NonFatal(e) =>
        logger.error(s"Failed to parse date '$date': $e")
        None
    }
  }
}